1 Data loading and pre-processing

# library(lmerTest) library(clubSandwich) library(rstanarm) library(rstan)
# NOTE: use library() rather than require() for hard dependencies:
# require() returns FALSE on failure instead of stopping, so a missing
# package would only surface later as a confusing "function not found" error.
library(tidyverse)
library(data.table)
library(purrr)
library(ggplot2)  # for plotting
library(ggsci)  # for plotting colors
library(hexbin)  # for plotting pair plots
library(bayesplot)  # for plotting Stan outputs
library(knitr)  # for Rmarkdown
library(kableExtra)  # for Rmarkdown
library(cmdstanr)  # for Stan
library(loo)  # for Bayesian LOO
library(polycor)  # estimate polychoric correlations via ML
library(GGally)  # plotting of pair plots
# Project directory layout and input file
dir.home <- "/Users/or105/Library/CloudStorage/OneDrive-ImperialCollegeLondon/OR_Work/2025/2025_project_Hope_Groups_Jordan"
dir.data <- file.path(dir.home, "data")
dir.out <- file.path(dir.home, "analysis-250814")
file.data <- file.path(dir.data, "PHQ_VAC_Jordan_v250812.csv")
set.seed(42L)

# Read in data and standardise the key column names
dp <- as.data.table(read.csv(file.data))
setnames(
    dp,
    old = c("f_participants", "facilitator_name", "Arm", "Timepoint"),
    new = c("pid", "fid", "arm_label", "time_label")
)
# 0/1 indicators: arm = 1 for Intervention, time = 1 for Endline
dp[, `:=`(
    arm = as.integer(arm_label == "Intervention"),
    time = as.integer(time_label == "Endline")
)]

# Drop rows without a participant identifier
dp <- dp[!is.na(pid)]

# Inspect participants with >2 rows and de-duplicate
dp[, table(pid)]
## pid
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   3   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   5   2   2 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  36  37  38  39  40  41 
##   2   2   3   2   2   2   2   2   2   3   2   1   2   2   2   2   2   2   2   2 
##  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61 
##   2   2   2   2   2   2   2   2   2   2   2   3   2   2   2   2   2   2   1   2 
##  62  63  64  65  66  67  68  69  70  72  73  74  75  76  77  78  79  80  81  82 
##   2   2   2   2   2   1   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 
##   2   2   2   2   2   2   2   2   2   2   2   4   2   2   1   2   2   2   2   2 
## 103 104 105 106 107 108 109 110 111 112 114 115 116 117 118 119 120 121 122 123 
##   2   2   1   2   2   3   2   2   3   2   2   2   2   2   2   2   2   2   2   2 
## 124 125 126 127 128 129 130 131 132 133 134 135 136 138 139 140 141 142 143 144 
##   2   3   2   2   2   2   2   2   2   2   1   2   2   2   2   1   1   2   2   3 
## 145 146 147 148 149 150 151 152 153 154 155 
##   2   2   2   1   1   2   2   1   1   2   2
# keep the first record per participant x arm x time combination
dp <- unique(dp, by = c("pid", "arm", "time"))

# Keep only participants observed at both baseline and endline
tmp <- dp[, list(ntime = uniqueN(time)), by = "pid"]
dp <- merge(dp, tmp[ntime == 2, list(pid)], by = "pid")

# Remove participant 13 with missing outcome
dp <- dp[pid != 13]

# Selected data:
dp[, table(pid)]
## pid
##   1   2   3   4   5   6   7   8   9  10  11  12  14  15  16  17  18  19  20  21 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  22  23  24  25  26  27  28  29  30  31  33  34  36  37  38  39  40  41  42  43 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  61  62  63  64 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  65  66  68  69  70  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  87  88  89  90  91  92  93  94  95  96  98  99 100 101 102 103 104 106 107 108 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 109 110 111 112 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
## 130 131 132 133 135 136 138 139 142 143 144 145 146 147 150 151 154 155 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2
# Set pid to continuous ID 1..n (original identifier is retained as pid_label)
tmp <- data.table(pid = dp[, sort(unique(pid))])
# seq_len() is the safe idiom (1:nrow() misbehaves on zero-row tables)
tmp[, pid_new := seq_len(nrow(tmp))]
dp <- merge(dp, tmp, by = "pid")
setnames(dp, c("pid", "pid_new"), c("pid_label", "pid"))

# Bring table into long format: first give all outcome columns consistent
# PHQ4_* / VACE_* / VACP_* names, then melt to one row per item response
setnames(dp, c("PHQ4_down_num", "PHQ4_interest_num", "PHQ4_nervous_num", "PHQ4_worry_num",
    "VAC_physical_total", "VAC_emotional_total", "VAC_negative_total", "v_threat_num",
    "v_refuse.speak_num", "v_insult_num", "v_shout_num", "v_abandon_num", "v_object_num",
    "v_push_num", "v_hit_num", "v_face_num", "v_beat_num"), c("PHQ4_down", "PHQ4_interest",
    "PHQ4_nervous", "PHQ4_worry", "VACP_total", "VACE_total", "VAC_total", "VACE_threat",
    "VACE_refusespeak", "VACE_insult", "VACE_shout", "VACE_abandon", "VACP_object",
    "VACP_push", "VACP_hit", "VACP_face", "VACP_beat"))
dp <- data.table::melt(dp, id.vars = c("time_label", "arm_label", "arm", "time",
    "pid", "pid_label", "fid"))

# Show value mins and maxs
dp[, list(value_min = min(value), value_max = max(value)), by = "variable"]
##             variable value_min value_max
##               <fctr>     <int>     <int>
##  1:       PHQ4_total         0        12
##  2:        PHQ4_down         0         3
##  3:    PHQ4_interest         0         3
##  4:     PHQ4_nervous         0         3
##  5:       PHQ4_worry         0         3
##  6:       VACP_total         0        15
##  7:       VACE_total         0        15
##  8:        VAC_total         0        30
##  9:      VACE_threat         0         3
## 10: VACE_refusespeak         0         3
## 11:      VACE_insult         0         3
## 12:       VACE_shout         0         3
## 13:     VACE_abandon         0         3
## 14:      VACP_object         0         3
## 15:        VACP_push         0         3
## 16:         VACP_hit         0         3
## 17:        VACP_face         0         3
## 18:        VACP_beat         0         3
# get data into long format per survey item
tmp <- unique(subset(dp, select = "variable"))
# '*_total' columns are aggregated Likert scores; the rest are raw Likert items.
# Reference the column directly inside the data.table frame (not tmp$variable).
tmp[, variable_type := ifelse(grepl("total", variable), "agg_likert", "likert")]
tmp[, vid := seq_len(nrow(tmp))]
dp <- merge(dp, tmp, by = "variable")

2 PHQ4 - Ordered Cat model

I will start considering the PHQ4 data on the actual item scale as it was recorded. The distribution of survey responses in the Control and Intervention groups at the baseline and endline time points is as follows:

Let me next consider how participants respond across the four questions.

At each survey time point, participants tended to respond to the four questions targeting the same underlying latent behaviour (here parental mental health) in a similar way. For example, we see that most commonly, the responses to all other PHQ4 questions were the same as that for the ‘PHQ4_down’ question. We also see that participants did not respond exactly the same across all other questions. We also observe an ordering of the responses, for example participants who responded ‘Nearly every day’ to the ‘PHQ4_down’ question, responded most often ‘Nearly every day’, then ‘More than half the days’, then ‘Several days’ and then ‘Not at all’. Here is a plot that illustrates this:

These patterns are as expected: the survey deliberately asks multiple similar questions to elicit a latent underlying behavior, and we do not expect that a single question is sufficient to elicit the latent underlying behavior well.

In statistical modelling, the latent underlying behavior is called a LATENT FACTOR. A good paper on Bayesian latent factor models in psychology applications is here. Here, we will adopt simple versions of latent factor models, in that we consider ONE LATENT FACTOR for a particular set of item responses. For example, we consider four survey items, ‘PHQ4_down’, ‘PHQ4_interest’, ‘PHQ4_nervous’, ‘PHQ4_worry’, and suppose there is one underlying latent factor ‘parental mental health’.

In our application, we suppose there is a latent factor/trait that might be unique for each participants, or identical for all participants in a certain GROUP, that determines the outcomes for each of the survey items on the Likert scale. In our case, the group is specified by participants being in the same study arm (Control or Intervention) and at the same time (baseline or endline), so we have four groups. Statistical models for such data tend to be called MULTI-GROUP LATENT FACTOR MODELS.

Furthermore, if we suppose that participants each have unique latent factor/traits, then in general we consider the latent factors EXCHANGEABLE and model participant-level latent factors through RANDOM EFFECTS around the group mean latent factor.

The above plot illustrates similarities in item responses for the same participants. For categorical and ordinal data, the state-of-the-art approach to quantify these within-group similarities are POLYCHORIC CORRELATIONS. In this approach, we envisage a latent space of continuous variables that determine the outcomes, and measure the correlations between these continuously valued variables. A good explanation and visualisation of polychoric correlations is here. Visually, we found strong similarities across survey items, and across time for each item and can quantify the strength of the similarities in terms of polychoric correlations. We find that in any group (Control/Intervention and Baseline/Endline), the polychoric correlations are strongly significant:

# Use '::' (exported API) rather than ':::' (internal accessor) for dcast
tmp <- data.table::dcast( dcat1, time_label + arm_label + pid ~ variable, value.var = 'value')
tmp <- melt(tmp, id.vars = c('time_label','arm_label','pid','PHQ4_down'))
# Polychoric correlation of each item against PHQ4_down, per arm x time group.
# NB: the 'polchor' [sic] column name is kept to match the printed output below.
tmp[,
    {
      z <- polycor::polychor(PHQ4_down, value, std.err = TRUE, ML = FALSE)
      list(polchor = z$rho, polychor_sd = sqrt(as.numeric(z$var))) 
    }, 
    by = c('arm_label','time_label')
    ]
##       arm_label time_label   polchor polychor_sd
##          <char>     <char>     <num>       <num>
## 1:      Control   Baseline 0.5360206  0.05661704
## 2: Intervention   Baseline 0.3645951  0.06864719
## 3:      Control    Endline 0.6481489  0.04694327
## 4: Intervention    Endline 0.4283159  0.06861041

We also observe a clustering of reported outcomes across time. Participants who responded ‘Not at all’ to any of the survey questions at baseline also tended to respond ‘Not at all’, then ‘Several days’, then ‘More than half the days’ then ‘Nearly every day’ to the same question at endline. For some survey questions, polychoric correlations between baseline and endline tend to be significant, perhaps most notably for ‘PHQ4_nervous’ in the ‘Control’ group:

But I note that in our data, the temporal polychoric correlations are not as strong as the polychoric correlations across item responses at the same time point:

# Use '::' (exported API) rather than ':::' (internal accessor) for dcast
tmp <- data.table::dcast( dcat1, variable + arm_label + pid ~ time_label, value.var = 'value_label')
# Polychoric correlation between baseline and endline responses, per arm x item
tmp[,
    {
      z <- polycor::polychor(Baseline, Endline, std.err = TRUE, ML = FALSE)
      list(polchor = z$rho, polychor_sd = sqrt(as.numeric(z$var))) 
    }, 
    by = c('arm_label','variable')
    ]
##       arm_label      variable     polchor polychor_sd
##          <char>        <fctr>       <num>       <num>
## 1:      Control     PHQ4_down  0.37442568  0.12322211
## 2: Intervention     PHQ4_down  0.30117399  0.13374631
## 3:      Control PHQ4_interest  0.26502676  0.12987878
## 4: Intervention PHQ4_interest -0.07642404  0.13806151
## 5:      Control  PHQ4_nervous  0.54265237  0.09910591
## 6: Intervention  PHQ4_nervous  0.06361367  0.14038866
## 7:      Control    PHQ4_worry  0.31110342  0.12819115
## 8: Intervention    PHQ4_worry -0.11281099  0.14079684

2.1 Flexible Multi-Group Ordered Cat models without participant effects

We will first consider a standard, simple ordered categorical model for participants \(i\) that provide survey item responses \(Y_{ij} = \{1,\dotsc, K\}\) on a Likert scale with \(K\) possible outcomes, without participant effects.

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c) \\ & \eta_{ij}= \beta_{G(i)}\\ & c = [c_1, c_1 + \text{cumsum}(\delta_{1:{K-2}})] \\ & c_1 \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{1:{K-2}} \sim \text{Normal}(0 , 2.5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1) \end{align*}\] where the probabilities of each response under the Ordered Logistic model are given by \[\begin{equation*} \text{Ordered-Logistic}(k;\eta_{ij}, c) = \begin{cases} 1 - \text{logit}^{-1}(\eta_{ij} - c_1) & \text{if k = 1}\\ \text{logit}^{-1}(\eta_{ij} - c_{k-1}) - \text{logit}^{-1}(\eta_{ij} - c_k) & \text{if 1 < k < K}\\ \text{logit}^{-1}(\eta_{ij} - c_{K-1}) & \text{if k = K.} \end{cases} \end{equation*}\]

The key points of this model are: we model the probabilities of the Likert outcomes in logit space, as one would do for \(0/1\) binary outcomes. The logit space takes real values, which are transformed through the inverse logit function into probabilities in \([0,1]\). The cutpoints \(c\) divide the logit-space and determine how the logit space is segmented to correspond to \(K\) distinct probabilities. For \(K\) categories, we need \(K-1\) cutpoints and this explains why for a Bernoulli model we generally don’t need any cutpoints. In the above model, the \(G(i)\) is a function that maps participants to one of the four groups (Control/Intervention/Baseline/Endline), and \(\beta_{G(i)}\) is the one latent factor that models the strength of the underlying parental mental health of all participants in the same group. In this simple model, all participants within the same group have the same identical latent factor, there are no participant effects. There is one more important conceptual point. Since \(\eta_{ij}= \beta_{G(i)}\) does not depend on \(j\), the probability that we observe \(\{Y_{ij} = k \}\) is in this model independent of whatever we observe for all the other survey items of the same participant. This model cannot capture ‘within-participant’ clustering of item responses, and any predicted polychoric correlations will be zero under this model.

A few more technical points: putting a normal prior on the cut points themselves would mean that they tend to be pulled towards zero, and so one normally prefers to put a prior on the cut point differences. This model is quite standard and readily available in Stan and other software packages.

Here is the Stan code for the above ordered categorical model without participant effects. This model is also straightforwardly available through the ‘brms’ package.

file.prefix <- 'parentalmh_ordered_categorical_'

# this is a textbook vanilla Ordered Categorical model, without participant effects:
# a single sum-to-zero group effect shifts the latent logit scale, and ONE shared
# set of K-1 cut points segments the logit scale into K category probabilities.
# Cut points are parameterised as a first cut point plus positive gaps, so the
# prior is on the gaps (not the cut points themselves) as discussed in the text.
# generated quantities return per-group category probabilities, posterior
# predictive draws and pointwise log-likelihoods (for LOO).
ordered_categorical_logit_txt_m0a <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> P; // number of groups
  array[N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
}
parameters
{
  sum_to_zero_vector[P] beta_group;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  vector<lower=0>[K - 2] cut_point_gaps_groups;
}
transformed parameters
{
  ordered[K-1] cut_points;
  cut_points = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups)
        );
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_group[group_of_obs[n]], 
      cut_points);
  }
  
  // priors
  beta_group ~ normal(0, s2z_sd_groups); 
  cut_point_1 ~ normal(0, 3.5);
  cut_point_gaps_groups ~ normal(0, 2.5); 
}
generated quantities
{
  matrix [K,P] ordered_prob;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  
  for (p in 1:P) {
    ordered_prob[1,p]       = 1 - inv_logit( beta_group[p] - cut_points[1] );
    ordered_prob[2:(K-1),p] = inv_logit( beta_group[p] - cut_points[1:(K-2)] ) 
                              - 
                              inv_logit( beta_group[p] - cut_points[2:(K-1)] );
    ordered_prob[K,p]       = inv_logit( beta_group[p] - cut_points[K-1] );
  }
    
  for( n in 1:N ) {
    ypred[n] = ordered_logistic_rng( 
                beta_group[group_of_obs[n]], 
                cut_points
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_group[group_of_obs[n]], 
                    cut_points);
  }
}
"

# write the Stan code to a file (tabs replaced by spaces)
ordered_categorical_logit_filename_m0a <- cmdstanr::write_stan_file(
  gsub('\t', ' ', ordered_categorical_logit_txt_m0a),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

# compile Stan model
ordered_categorical_logit_compiled_m0a <- cmdstanr::cmdstan_model(ordered_categorical_logit_filename_m0a, force_recompile = TRUE)
# assemble the data list in the format required by the Stan program
stan_data <- list(
  N = nrow(dcat1),
  P = max(dcat1$group),
  K = length(unique(dcat1$value)),
  y = dcat1$value,
  group_of_obs = dcat1$group
)

# sample: 2 chains, 500 warmup + 1500 sampling iterations each (4000 draws total)
ordered_categorical_logit_fit_m0a <- ordered_categorical_logit_compiled_m0a$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 11.8 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 12.2 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 12.0 seconds.
## Total execution time: 12.3 seconds.
# save the full fit object to RDS so it can be reloaded without re-sampling
ordered_categorical_logit_fit_m0a$save_object(file = file.path(dir.out, paste0(file.prefix,"m0a_stan.rds")))
# extract posterior draws of the group-level category probabilities
po <- ordered_categorical_logit_fit_m0a$draws(
  variables = c('ordered_prob'),
  inc_warmup = FALSE,
  format = "draws_df"
  )
po <- as.data.table(po)
# rename 'ordered_prob[k,p]' columns to 'ordered_prob_k,p'
setnames(po, colnames(po), gsub('\\[', '_', gsub('\\]', '', colnames(po))))
po <- data.table::melt(po, id.vars = c('.draw', '.chain', '.iteration'), value.name = 'prob')
# recover the category (value) and group indices from the variable names
po[, value := as.integer(gsub('ordered_prob_([0-9]),([0-9])', '\\1', variable))]
po[, group := as.integer(gsub('ordered_prob_([0-9]),([0-9])', '\\2', variable))]

# attach arm/time/category labels for plotting
tmp <- unique(subset(dcat1, select = c(group, arm, arm_label, time, time_label, value, value_label)))
po <- merge(po, tmp, by = c('group', 'value'))

This is the model fit (in black) relative to the aggregated survey items. The model is not flexible enough to capture the proportions for each group exactly.

In the model above, there are \(P-1\) free group parameters and \(K-1\) cut point parameters, so \(6\) in total. We aim to describe \(4*4 = 16\) probabilities across the groups that must sum to \(1\) within each group, so are free to determine \(12\) values. This implies that the model above is UNDER-SPECIFIED: it aims to explain the data parsimoniously with fewer parameters compared to the complexity of our target statistics.

The primary reason for the poor model fit is that there is only one way in which the logit space is segmented to create the categorical probabilities for each of the four groups. A straightforward way to increase the expressiveness of the model is to include cut points for each of the \(4\) groups. That way, the logit space can be appropriately segmented for each group, enabling us to match the observed group-specific probabilities.

The updated model, still without participant effects, is as follows:

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \beta_{G(i)}\\ & c_g = [c_{g1}, c_{g1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{g1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}(0 , 2.5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1) \end{align*}\]

The main difference to the previous model is that for each group we now have enough free parameters to describe the four categorical probabilities. That is, because they need to sum to 1, there are 3 free parameters for the 4 possible Likert outcomes per group, and 12 free parameters across the 4 groups in total.

file.prefix <- 'parentalmh_ordered_categorical_'
# this is a vanilla model without unit effects, but now with GROUP-SPECIFIC
# cut points: the K-2 positive gaps are a [K-2, P] matrix (the first cut point
# is still shared), so each group's logit scale can be segmented differently.
ordered_categorical_logit_txt_m0 <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> P; // number of groups
  array[N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
}
parameters
{
  sum_to_zero_vector[P] beta_group;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  matrix<lower=0>[K - 2,P] cut_point_gaps_groups;
}
transformed parameters
{
  array[P] ordered[K-1] cut_points;
  for (p in 1:P)
  {
    cut_points[p] = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups[,p])
        );
  }
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_group[group_of_obs[n]], 
      cut_points[group_of_obs[n]]);
  }
  
  // priors
  beta_group ~ normal(0, s2z_sd_groups); 
  cut_point_1 ~ normal(0, 3.5); // gtools::inv.logit(2*3.5) = 0.9991
  to_vector(cut_point_gaps_groups) ~ normal(0, 2.5); // gtools::inv.logit(2*2) = 0.98
}
generated quantities
{
  matrix [K,P] ordered_prob;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  
  for (p in 1:P) {
    ordered_prob[1,p]       = 1 - inv_logit( beta_group[p] - cut_points[p][1] );
    ordered_prob[2:(K-1),p] = inv_logit( beta_group[p] - cut_points[p][1:(K-2)] ) 
                              - 
                              inv_logit( beta_group[p] - cut_points[p][2:(K-1)] );
    ordered_prob[K,p]       = inv_logit( beta_group[p] - cut_points[p][K-1] );
  }
    
  for( n in 1:N ) {
    ypred[n] = ordered_logistic_rng( 
                beta_group[group_of_obs[n]], 
                cut_points[group_of_obs[n]]
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_group[group_of_obs[n]], 
                    cut_points[group_of_obs[n]]);
  }
}
"

# write the Stan code to a file (tabs replaced by spaces)
stan_code_m0 <- gsub('\t', ' ', ordered_categorical_logit_txt_m0)
ordered_categorical_logit_filename_m0 <- cmdstanr::write_stan_file(
  stan_code_m0,
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

# compile Stan model
ordered_categorical_logit_compiled_m0 <- cmdstanr::cmdstan_model(ordered_categorical_logit_filename_m0, force_recompile = TRUE)

Let us fit this model:

# assemble the data list in the format required by the Stan program
stan_data <- list(
  N = nrow(dcat1),
  P = max(dcat1$group),
  K = length(unique(dcat1$value)),
  y = dcat1$value,
  group_of_obs = dcat1$group
)

# sample: 2 chains, 500 warmup + 1500 sampling iterations each
ordered_categorical_logit_fit_m0 <- ordered_categorical_logit_compiled_m0$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 16.9 seconds.
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 17.7 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 17.3 seconds.
## Total execution time: 17.8 seconds.
# save the full fit object to RDS
ordered_categorical_logit_fit_m0$save_object(file = file.path(dir.out, paste0(file.prefix,"m0_stan.rds")))
# extract posterior draws of the group-level category probabilities
po <- ordered_categorical_logit_fit_m0$draws(
  variables = c('ordered_prob'),
  inc_warmup = FALSE,
  format = "draws_df"
  )
po <- as.data.table(po)
# rename 'ordered_prob[k,p]' columns to 'ordered_prob_k,p'
setnames(po, colnames(po), gsub('\\[', '_', gsub('\\]', '', colnames(po))))
po <- data.table::melt(po, id.vars = c('.draw', '.chain', '.iteration'), value.name = 'prob')
# recover the category (value) and group indices from the variable names
po[, value := as.integer(gsub('ordered_prob_([0-9]),([0-9])', '\\1', variable))]
po[, group := as.integer(gsub('ordered_prob_([0-9]),([0-9])', '\\2', variable))]

# attach arm/time/category labels for plotting
tmp <- unique(subset(dcat1, select = c(group, arm, arm_label, time, time_label, value, value_label)))
po <- merge(po, tmp, by = c('group', 'value'))

This is the model fit (in black) relative to the aggregated survey items. Hooray, the model is flexible enough to capture the proportions for each Likert outcome and each group exactly.

This is the model fit (in black) relative to each survey item, showing the 95% posterior uncertainty interval in whiskers, the 50% posterior uncertainty interval as a box and the posterior median as a thick black line. There remains unexplained heterogeneity across each survey item. We consider this okay because each question elicits the underlying target behaviour in a slightly different way.

Here is a visualisation of the estimated polychoric correlations. We can clearly see that the model is unable to capture correlations between item responses. We will revisit this further down below.

2.2 Models with participant effects

One approach to introduce clustering of responses within the same participant is to suppose that each participant has their own latent underlying factor/trait. We can implement this by introducing random effects around the group-mean latent factor. The updated model, now with participant effects, is as follows:

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \beta_{G(i)} + \gamma_i\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \gamma \sim \text{S2Z-Normal}(0, \sigma_\gamma^2)\\ & \sigma_\gamma \sim \text{Exponential}(2)\\ & c_g = [c_{g1}, c_{g1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{g1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}(0 , 2.5^2) \end{align*}\]

To prepare adding participant effects to the ordered categorical model, I investigated the prior densities of the ordered categorical model in detail on various versions of the PHQ data set. It turned out that the prior on the ‘cut_point_gaps_groups’ had to be specified with a broad variance such as ‘normal(0, 2.5)’, in order for the baseline model to accurately estimate the empirical frequencies within 50% posterior credible intervals. Once this was established, I changed the priors accordingly in the previous models too.

I also wanted the model with participant effects to closely reproduce the estimates of the baseline model when each participant provides exactly one observation point, so that there are no within-participant correlations. I achieved this with a ‘beta_unit_sd ~ exponential(2)’ prior on the participant random effects model. A standard ‘cauchy(0,1)’ prior failed in this regard, as it induced pronounced curvature in the posterior geometry that in turn seems to be associated with numerically unstable evaluations of the log likelihood.

Let me illustrate these findings. First though, here is the Stan code of the ordered categorical model with participant effects. I incorporated an Exponential prior on the participant random effect sd’s to help achieve accurate estimates when the model is overly complex/has too many free parameters, but otherwise this follows on straightforwardly from the model without participant effects above.

# Ordered Categorical model WITH participant (unit) random effects:
# eta = beta_group[g] + beta_unit_sd * beta_unit[u], a non-centred
# parameterisation with sum-to-zero unit effects and an exponential(2)
# prior on the random-effect sd (see discussion in the text above).
# Cut points remain group-specific as in model m0.
ordered_categorical_logit_txt_m1 <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> P; // number of groups
  int<lower=1> U; // number of units
  array [N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=U> unit_of_obs;
}
transformed data
{
  real s2z_sd_groups;
  real s2z_sd_unit;
  s2z_sd_groups = inv(sqrt(1. - inv(P)));
  s2z_sd_unit = inv(sqrt(1. - inv(U)));
}
parameters
{
  sum_to_zero_vector[P] beta_group;
  real<lower=0> beta_unit_sd;
  sum_to_zero_vector[U] beta_unit;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  matrix<lower=0>[K - 2,P] cut_point_gaps_groups;
}
transformed parameters
{
  array[P] ordered[K-1] cut_points;
  for (p in 1:P)
  {
    cut_points[p] = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups[,p])
        );
  }
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_group[group_of_obs[n]] + 
        beta_unit_sd * beta_unit[unit_of_obs[n]], 
      cut_points[group_of_obs[n]]);
  }
  
  // priors
  beta_group ~ normal(0, s2z_sd_groups); 
  beta_unit ~ normal(0, s2z_sd_unit); 
  beta_unit_sd ~ exponential(2); // a +1 increase corresponds to ~ doubling of prob, 
                                 // although there is an issue in that for small probs, the same increment provides larger multipliers
                                 // gtools::inv.logit(c(-3,-2)) = 0.04742587 0.11920292
                                 // gtools::inv.logit(c(-1,0)) = 0.2689414 0.5000000
  cut_point_1 ~ normal(0, 3.5); // gtools::inv.logit(2*3.5) = 0.999
  to_vector(cut_point_gaps_groups) ~ normal(0, 2.5); // gtools::inv.logit(-3 + 2 = 1) = 0.26, empirically we know the largest next prob is ~ +0.20
}
generated quantities
{
  array [K,N] real<lower=0, upper=1> ordered_prob_by_obs;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  vector[U] beta_unit_re;
  
  // now need to calculate ordered_prob for each observation
  {
    real tmp_real;
    for (n in 1:N) 
    {
        tmp_real = beta_group[group_of_obs[n]] + beta_unit_sd * beta_unit[unit_of_obs[n]];
        ordered_prob_by_obs[1,n]       = 1 - inv_logit( tmp_real - cut_points[group_of_obs[n]][1] );
        ordered_prob_by_obs[2:(K-1),n] = to_array_1d( inv_logit( tmp_real - cut_points[group_of_obs[n]][1:(K-2)] ) 
                                                                  - 
                                                                  inv_logit( tmp_real - cut_points[group_of_obs[n]][2:(K-1)] )
                                                                  );
        ordered_prob_by_obs[K,n]       = inv_logit( tmp_real - cut_points[group_of_obs[n]][K-1] );
    }
  }
  
  for( n in 1:N ) {
    ypred[n] = ordered_logistic_rng( 
                beta_group[group_of_obs[n]] + 
                  beta_unit_sd * beta_unit[unit_of_obs[n]], 
                cut_points[group_of_obs[n]]
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_group[group_of_obs[n]] + 
                      beta_unit_sd * beta_unit[unit_of_obs[n]], 
                    cut_points[group_of_obs[n]]);
  }
  
  beta_unit_re = beta_unit_sd * beta_unit;
}
"

# write the Stan code to a file (tabs replaced by spaces)
stan_code_m1 <- gsub('\t', ' ', ordered_categorical_logit_txt_m1)
ordered_categorical_logit_m1_filename <- cmdstanr::write_stan_file(
  stan_code_m1,
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

# compile Stan model
ordered_categorical_logit_compiled_m1 <- cmdstanr::cmdstan_model(ordered_categorical_logit_m1_filename)

2.2.1 Baseline model without participant effects on endline only data

Here I engineer the situation that we have one observation per participant, using the endline data:

file.prefix <- 'parentalmh_ordered_categorical_endlineonly_priorsv2_'

# Reduce the data to one observation per participant:
# keep endline rows, compute each participant's median response (type = 1
# so the median is always an observed category), keep one row matching
# that median per participant, and re-index the remaining observations.
dcat1e <- subset(dcat1, time_label == 'Endline')
# re-base group ids to start at 1 for the endline-only subset
# (assumes endline groups were coded 3,4 in dcat1 -- TODO confirm upstream)
dcat1e[, group := group - 2L]
# per-participant median response; 'probs' spelled out in full
# (the original relied on partial matching of 'p' to 'probs')
tmp <- dcat1e[, list(value = quantile(value, probs = 0.5, type = 1)), by = c('arm','pid')]
dcat1e <- merge(dcat1e, tmp, by = c('arm','pid','value'))
dcat1e <- unique(dcat1e, by = c('arm','pid','value'))
# observation id; seq_len(.N) is safe for zero rows, unlike 1:nrow(...)
dcat1e[, oid := seq_len(.N)]

dcat1e[, table(pid)]
## pid
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
dcat1e[, table(arm)]
## arm
##  1  2 
## 67 71

I first fitted a model without participant effects on the endline only data as a benchmark:

2.2.2 Model with participant effects on endline only data

I next fitted the ordered categorical model with participants effects to the endline only data.

file.prefix <- 'parentalmh_ordered_categorical_endlineonly_priorsv2_'

# assemble the Stan data list in one expression
stan_data <- list(
  N = nrow(dcat1e),                  # number of observations
  P = max(dcat1e$group),             # number of groups
  U = max(dcat1e$pid),               # number of participants (units)
  K = length(unique(dcat1e$value)),  # number of ordered categories
  y = dcat1e$value,
  group_of_obs = dcat1e$group,
  unit_of_obs = dcat1e$pid
)

# draw posterior samples
ordered_categorical_logit_endl_fit_m1 <- ordered_categorical_logit_compiled_m1$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 5.4 seconds.
## Chain 2 finished in 5.4 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 5.4 seconds.
## Total execution time: 5.6 seconds.
# persist the fitted model object to RDS
rds_path <- file.path(dir.out, paste0(file.prefix, "m1_stan.rds"))
ordered_categorical_logit_endl_fit_m1$save_object(file = rds_path)

Here are the marginal posterior participant effects, shown by the response of the participant. We see the credible intervals are not symmetric around zero, reflecting curvature in the geometry of the joint posterior distribution. It is because of this curvature that an Exponential prior on the random effect variation improves accuracy.

Fitting the model produces the following fits. We obtain estimates that are essentially as good as those of the baseline model from the previous section, even though the model is overly flexible/has too many free parameters:

2.2.3 Model with participant effects on full data

After all these checks, let me now move on to apply the ordered categorical model with participant effects to the entire PHQ4 data.

file.prefix <- 'parentalmh_ordered_categorical_'

# assemble the Stan data list for the full PHQ4 data in one expression
stan_data <- list(
  N = nrow(dcat1),                  # number of observations
  P = max(dcat1$group),             # number of groups
  U = max(dcat1$pid),               # number of participants (units)
  K = length(unique(dcat1$value)),  # number of ordered categories
  y = dcat1$value,
  group_of_obs = dcat1$group,
  unit_of_obs = dcat1$pid
)

# draw posterior samples
ordered_categorical_logit_fit <- ordered_categorical_logit_compiled_m1$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 24.8 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 24.9 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 24.9 seconds.
## Total execution time: 25.0 seconds.
# persist the fitted model object to RDS
rds_path <- file.path(dir.out, paste0(file.prefix, "m1_stan.rds"))
ordered_categorical_logit_fit$save_object(file = rds_path)

I illustrate the participant effects first. We observe that the model accounts for the observed clustering of responses among participants and across time by placing negative effects on individual-level responses when the typical response is ‘Not at all’ or ‘Several days’, and positive effects when the typical response is ‘More than half the days’ or ‘Nearly every day’.

This is the model fit (in black) relative to the aggregated survey items. This looks pretty good with regard to our target statistics, and captures some of the clustering of responses among the same participant, especially over time. However note that the \(\eta_{ij}\) term in the model only depends on \(i\), and so the model cannot account for clustering among the responses for each participant. I will return to this in a bit.

3 PHQ4 - Poisson model

More pressingly, I will next consider the Likert outcomes as if they were counts and aggregate these across all itemized questions that target the same behaviour. The idea is that we might be able to model these “count data” with a simpler Poisson model.

3.1 Poisson model without participant effects

Here is the Poisson model without participant effects,

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Poisson}(k; \lambda_{ij}) \\ & \log\lambda_{ij}= \beta_0 + \beta_{G(i)}\\ & \beta_0 \sim \text{S2Z-Normal}(0, 5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1) \end{align*}\]

and here is the corresponding Stan code:

# this is a vanilla model without unit effects
# Stan program (string literal, written to disk and compiled below):
# Poisson GLM on the log scale with a global intercept and sum-to-zero
# group effects. The sum-to-zero constraint is implemented via a QR
# generating matrix; s2z_sd_groups rescales the prior sd so the implied
# marginal prior on each group effect is standard normal.
# Generated quantities return per-observation log-lik (for LOO),
# posterior predictive draws, and the group-level log rate.
poisson_log_txt_m0 <- "
functions
{
  matrix sum2zero_generating_matrix(int K) {
    matrix[K, K] A = diag_matrix(rep_vector(1, K));
    for (i in 1:K - 1) A[K, i] = -1;
    A[K, K] = 0;
    return qr_Q(A)[ , 1:(K - 1)];
  }
}
data
{
  int<lower=1> N; // number of observations
  int<lower=2> P; // number of groups
  int<lower=1> K; // max number of counts
  array [N] int<lower=0, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    matrix[P,P-1] s2z_Q_groups;
    
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_Q_groups = sum2zero_generating_matrix(P);
}
parameters
{
  real beta_0;
  vector[P-1] beta_groups_m1;
}
transformed parameters
{
  vector[P] beta_group;
  vector[N] log_lambda;
  beta_group =  s2z_Q_groups * beta_groups_m1;
  log_lambda = beta_0 + beta_group[group_of_obs];
}
model
{
  // likelihood
  target += poisson_log_lpmf(y | log_lambda);
  
  // priors
  target += normal_lupdf(beta_groups_m1 | 0, s2z_sd_groups); 
  target += normal_lupdf(beta_0 | 0, 5); 
}
generated quantities
{
  array [N] real log_lik;
  array [P] real log_lambda_group;
  array [N] int<lower=0> ypred;

  for( n in 1:N ) 
  {
    ypred[n] = poisson_log_rng( log_lambda[n] );
    log_lik[n] =  poisson_log_lpmf( y[n] | log_lambda[n] );
  }
  for( p in 1:P )
  {
    log_lambda_group[p] = beta_0 + beta_group[p];  
  }
}
"

# compile the model: write the tab-stripped Stan program to disk,
# then force a fresh cmdstanr compilation
poisson_log_filename_m0 <- cmdstanr::write_stan_file(
  code = gsub("\t", " ", poisson_log_txt_m0),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)
poisson_log_compiled_m0 <- cmdstanr::cmdstan_model(poisson_log_filename_m0, force_recompile = TRUE)

The Poisson model has fewer free parameters and is much easier and faster to fit (compare the runtimes):

# assemble the Stan data list (aggregated counts, dcat2) in one expression
stan_data <- list(
  N = nrow(dcat2),         # number of observations
  K = max(dcat2$value),    # maximum observed count
  P = max(dcat2$group),    # number of groups
  y = dcat2$value,
  group_of_obs = dcat2$group
)

# draw posterior samples; initialise beta_0 near log of typical counts
poisson_log_fit_m0 <- poisson_log_compiled_m0$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  init = list(list(beta_0 = 6, beta_groups_m1 = rep(0, 3)),
              list(beta_0 = 6, beta_groups_m1 = rep(0, 3))),
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 finished in 0.4 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 0.4 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 0.4 seconds.
## Total execution time: 0.6 seconds.
# save output to RDS
# NOTE(review): file.prefix was last set to 'parentalmh_ordered_categorical_'
# further above, so this Poisson fit is written as
# .../parentalmh_ordered_categorical_m0_stan.rds -- confirm that a
# Poisson-specific prefix was not intended here.
poisson_log_fit_m0$save_object(file = file.path(dir.out, paste0(file.prefix,"m0_stan.rds")))

This is the model fit to the aggregated Likert outcomes when considered as counts. We observe in this particular case here, the Poisson model does not capture the structure in the Likert outcome data at all, reflecting that it has much fewer parameters than the Ordered Categorical model.

3.2 Model comparison on non-aggregated data

We cannot directly compare the Poisson model above to the Ordered Categorical models of the previous section because they are fitted on different data sets. For this reason, I will now consider the non-aggregated responses for each survey question, but instead of interpreting these data as outcomes on a Likert scale, I will interpret these data as counts, and model these with a Poisson distribution. I will start with the model without participant effects.

file.prefix <- 'parentalmh_poisson_likert_'

# assemble the Stan data list: Likert categories 1..K shifted down by one
# so they read as counts 0..(K-1)
stan_data <- list(
  N = nrow(dcat1),
  K = max(dcat1$value) - 1L,   # maximum count after the shift
  P = max(dcat1$group),
  y = dcat1$value - 1L,
  group_of_obs = dcat1$group
)

# draw posterior samples
poisson_log_likert_fit_m0 <- poisson_log_compiled_m0$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  init = list(list(beta_0 = 6, beta_groups_m1 = rep(0, 3)),
              list(beta_0 = 6, beta_groups_m1 = rep(0, 3))),
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 1.5 seconds.
## Chain 2 finished in 1.5 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 1.5 seconds.
## Total execution time: 1.6 seconds.
# persist the fitted model object to RDS
rds_path <- file.path(dir.out, paste0(file.prefix, "m0_stan.rds"))
poisson_log_likert_fit_m0$save_object(file = rds_path)

This is the fit of the Poisson model. Note that the Likert outcomes are interpreted as counts 0,1,2,3 and described with a Poisson model, rather than distinct ordered scales. We can see that the Poisson fit always follows a unimodal shape that does not capture the more irregular/higher frequencies of the 2 and 3 category.

3.3 Poisson model with participant effects

The Poisson model can be easily extended to include participant effects,

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Poisson}(k; \lambda_{ij}) \\ & \log\lambda_{ij}= \beta_0 + \beta_{G(i)} + \gamma_i\\ & \beta_0 \sim \text{S2Z-Normal}(0, 5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \gamma \sim \text{S2Z-Normal}(0, \sigma^2_\gamma)\\ & \sigma_\gamma \sim \text{Cauchy}(0,1) \end{align*}\]

Here is the corresponding Stan code:

# Poisson model WITH sum-to-zero participant (unit) random effects.
# (The previous comment said 'without unit effects' -- a copy-paste slip
# from the m0 chunk.)
# transformed data pre-computes, per group, the units of its observations so
# generated quantities can average the unit effects within each group.
# FIX: unit_of_group_n and unit_of_group_n_cumulated were declared with
# lower=1, but both start at 0 and the generated-quantities guard
# 'if (unit_of_group_n[p] > 0)' explicitly allows a group with zero
# observations; such a group would have failed Stan's transformed-data
# constraint validation. Bounds relaxed to lower=0 to match the guard.
# NOTE(review): tmp_real is declared at the top level of generated
# quantities and is therefore saved with every draw; harmless but noisy.
poisson_log_txt_m1 <- "
functions
{
  matrix sum2zero_generating_matrix(int K) {
    matrix[K, K] A = diag_matrix(rep_vector(1, K));
    for (i in 1:K - 1) A[K, i] = -1;
    A[K, K] = 0;
    return qr_Q(A)[ , 1:(K - 1)];
  }
}
data
{
  int<lower=1> N; // number of observations
  int<lower=2> P; // number of groups
  int<lower=1> K; // max number of counts
  array [N] int<lower=0, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  int<lower=1> U; // number of units
  array [N] int<lower=1,upper=U> unit_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    matrix[P,P-1] s2z_Q_groups;
    real s2z_sd_unit;
    matrix[U,U-1] s2z_Q_unit;
    array [N] int<lower=1,upper=U> unit_of_group;
    array [P] int<lower=0,upper=N> unit_of_group_n;
    array [P] int<lower=0,upper=N> unit_of_group_n_cumulated;
    int tmp;
  
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_Q_groups = sum2zero_generating_matrix(P);
    
    s2z_sd_unit = inv(sqrt(1. - inv(U)));
    s2z_Q_unit = sum2zero_generating_matrix(U);
    
    tmp = 0;
    for( p in 1:P){
      unit_of_group_n[p] = 0;
      for( n in 1:N){
        if( group_of_obs[n] == p)
        {
          unit_of_group_n[p] = unit_of_group_n[p] + 1;
          tmp = tmp + 1;
          unit_of_group[tmp] = unit_of_obs[n];
        }
      }
      unit_of_group_n_cumulated[p] = sum(unit_of_group_n[1:p]);
    }
}
parameters
{
  real beta_0;
  vector[P-1] beta_groups_m1;
  real<lower=0> beta_unit_sd;
  vector[U-1] beta_unit_m1;
}
transformed parameters
{
  vector[P] beta_group;
  vector[N] log_lambda;
  vector[U] beta_unit;
  
  beta_group =  s2z_Q_groups * beta_groups_m1;
  beta_unit =  s2z_Q_unit * beta_unit_m1;
  log_lambda = beta_0 + beta_group[group_of_obs] + beta_unit_sd * beta_unit[unit_of_obs];
}
model
{
  // likelihood
  target += poisson_log_lpmf(y | log_lambda);
  
  // priors
  target += normal_lupdf(beta_groups_m1 | 0, s2z_sd_groups); 
  target += normal_lupdf(beta_0 | 0, 5); 
  target += normal_lupdf(beta_unit_m1 | 0, s2z_sd_unit); 
  target += cauchy_lupdf(beta_unit_sd | 0, 1);
}
generated quantities
{
  array [N] real log_lik;
  array [P] real log_lambda_group;
  array [N] int<lower=0> ypred;
  real tmp_real;
  
  for( n in 1:N ) 
  {
    ypred[n] = poisson_log_rng( log_lambda[n] );
    log_lik[n] =  poisson_log_lpmf( y[n] | log_lambda[n] );
  }
  for( p in 1:P )
  {
    tmp_real = 0;
    if( unit_of_group_n[p] > 0 )
    {
      tmp_real = mean( beta_unit_sd * beta_unit[ segment(unit_of_group, p==1 ? 1 : (unit_of_group_n_cumulated[p-1] + 1), unit_of_group_n[p]) ] );
    }
    log_lambda_group[p] = beta_0 + beta_group[p] + tmp_real;  
  }
}
"

# compile the model: write the tab-stripped Stan program to disk,
# then force a fresh cmdstanr compilation
poisson_log_filename_m1 <- cmdstanr::write_stan_file(
  code = gsub("\t", " ", poisson_log_txt_m1),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)
poisson_log_compiled_m1 <- cmdstanr::cmdstan_model(poisson_log_filename_m1, force_recompile = TRUE)

This is the fit of the Poisson model with participant effects. We see that adding participant effects does not improve model performance.

3.4 Compare Poisson to Ordered Categorical models

To quantify the differences in model fit not only by eye but also quantitatively, we consider as accuracy measure Bayesian leave-one-out expected log posterior density (Bayes-LOO-ELPD):

# Compare the five models by Bayes-LOO-ELPD:
#   model1 - Poisson, no participant effects
#   model2 - Poisson, participant random effects
#   model3 - textbook ordered categorical, no participant effects
#   model4 - multi-group ordered categorical, no participant effects
#   model5 - multi-group ordered categorical, participant random effects
loo_objects <- list(
  poisson_log_likert_fit_m0_loo,
  poisson_log_likert_fit_m1_loo,
  ordered_categorical_logit_fit_m0a_loo,
  ordered_categorical_logit_fit_m0_loo,
  ordered_categorical_logit_fit_loo
)
comp <- loo::loo_compare(loo_objects)
print(comp, simplify = FALSE)
##        elpd_diff se_diff elpd_loo se_elpd_loo p_loo   se_p_loo looic   se_looic
## model5     0.0       0.0 -1352.3     18.4       115.5     3.1   2704.5    36.9 
## model3  -104.8      15.4 -1457.1     11.3         6.0     0.1   2914.2    22.6 
## model4  -108.8      15.3 -1461.1     11.6        12.0     0.2   2922.2    23.2 
## model2  -200.7      16.0 -1553.0     12.8        35.0     1.1   3105.9    25.7 
## model1  -235.8      19.2 -1588.0     13.1         2.7     0.1   3176.1    26.3

Woah, but also as expected from all the plots above. We see that the multi-group ordered categorical with participant effects is the top performing model in terms of Bayes-LOO-ELPD. All other models have estimated ELPD’s such that ELPD + 2 x standard errors remains below the top performing model. The difference in ELPD between the Ordered Categorical and Poisson model is very large. With this we conclude that the differences in model performances are significant. This matches the visual behavior of the model fits shown in the plots above. The Poisson models therefore provide a worse fit to the data than the Ordered Categorical models.

4 PHQ4 - Ordered Categorical Models with response correlations

We still cannot capture any of the polychoric correlations between responses at the same time point that we observed in the data.

4.1 Model with latent MVN correlations

To fix this, we could adopt the classical model used to quantify polychoric correlations. That is, we add multivariate normal random effects for each item response, and such that the correlations are explicitly tracked in the MVN-correlation matrix. The corresponding model is

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \beta_{G(i)} + \gamma_j\\ & c_g = [c_{g1}, c_{g1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{g1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}(0 , 2.5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \gamma \sim \text{MVN-Normal}(0, \sigma_\gamma L L^T \sigma_\gamma^T)\\ & L \sim \text{LKJ}(2)\\ & \sigma_\gamma \sim \text{Cauchy}(0,1) \end{align*}\]

Fitting this model in a Bayesian setting is however challenging due to the unidentifiability introduced between the \(\beta_{g}\) and \(\gamma_j\). I tried this, but opted against pursuing this approach further, as it won’t scale well and remains partly unidentifiable.

4.2 Textbook model with latent S2Z correlations

I was able to make more progress by considering fixed or random effects for each item response that are a priori independent of each other, so that the correlations are not explicitly tracked as a distinct model parameter. The primary insight here is that the joint posterior density of the response-level random effects may nonetheless capture correlations, and we can compute the correlations from the Monte Carlo samples of the joint posterior density. Additionally, we can ensure parameter identifiability by placing sum-to-zero priors on the iid response-level random effects. This model is bound to sample very well.

Here is the fixed effects version, with baseline fixed effect parameters for each survey item. I started with the textbook Ordered Categorical model that does not have group-specific cutpoints:

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c) \\ & \eta_{ij}= \beta_{G(i)} + \gamma_j\\ & c = [c_{1}, c_{1} + \text{cumsum}(\delta_{1:{K-2}})] \\ & c_{1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{1:{K-2}} \sim \text{Normal}(0 , 2.5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \gamma \sim \text{S2Z-Normal}(0,1) \end{align*}\]

Here is the corresponding Stan code:

file.prefix <- 'parentalmh_ordered_categorical_'

# this is a model with correlation among outcomes but without unit effects 
# Stan program (string literal): textbook ordered-logistic model with
# sum-to-zero group effects plus sum-to-zero question (item) effects.
# Cutpoints are SHARED across groups here (cf. the multi-group variant
# below). Correlations between item responses are not modelled explicitly;
# they are read off the joint posterior of the question effects.
# Generated quantities return category probabilities by group and by
# group-question pair, posterior predictive draws, and per-observation
# log-lik for LOO.
ordered_categorical_logit_txt_m4a <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> Q; // number of questions
  int<lower=2> P; // number of groups
  array[N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=Q> question_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    real s2z_sd_questions;
    
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_sd_questions = inv(sqrt(1. - inv(Q)));
}
parameters
{
  sum_to_zero_vector[P] beta_group;
  
  // parameters for correlation in responses
  sum_to_zero_vector[Q] beta_questions;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  vector<lower=0>[K - 2] cut_point_gaps_groups;
}
transformed parameters
{
  ordered[K-1] cut_points;
  cut_points = 
      append_row(
        cut_point_1, 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups)
        );
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
      cut_points);
  }
  
  // priors
  beta_group ~ normal(0, s2z_sd_groups); 
  cut_point_1 ~ normal(0, 3.5); 
  cut_point_gaps_groups ~ normal(0, 2.5);
  beta_questions ~ normal(0, s2z_sd_questions); 
}
generated quantities
{
  matrix [K,P] ordered_prob;
  array [K,P,Q] real<lower=0, upper=1> ordered_prob_by_question;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  
  for (p in 1:P) {
    ordered_prob[1,p]       = 1 - inv_logit( beta_group[p] - cut_points[1] );
    ordered_prob[2:(K-1),p] = inv_logit( beta_group[p] - cut_points[1:(K-2)] ) 
                              - 
                              inv_logit( beta_group[p] - cut_points[2:(K-1)] );
    ordered_prob[K,p]       = inv_logit( beta_group[p] - cut_points[K-1] );
  }
  
  for (q in 1:Q) {
    for (p in 1:P) {
      ordered_prob_by_question[1,p,q]       = 1 - inv_logit( beta_group[p] + beta_questions[q] - cut_points[1] );
      ordered_prob_by_question[2:(K-1),p,q] = to_array_1d( inv_logit( beta_group[p] + beta_questions[q] - cut_points[1:(K-2)] ) 
                                                           - 
                                                           inv_logit( beta_group[p] + beta_questions[q] - cut_points[2:(K-1)] )
                                                           );
      ordered_prob_by_question[K,p,q]       = inv_logit( beta_group[p] + beta_questions[q] - cut_points[K-1] );
    }
  }
    
  for( n in 1:N ) {
    ypred[n] = ordered_logistic_rng( 
                beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
                cut_points
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
                    cut_points);
  }
}
"

# compile the model: strip tabs from the Stan program, write it to disk,
# then force a fresh cmdstanr compilation
tmp <- gsub("\t", " ", ordered_categorical_logit_txt_m4a)
ordered_categorical_logit_filename_m4a <- cmdstanr::write_stan_file(
  code = tmp,
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)
ordered_categorical_logit_compiled_m4a <- cmdstanr::cmdstan_model(ordered_categorical_logit_filename_m4a, force_recompile = TRUE)
# assemble the Stan data list (group + question indices) in one expression
stan_data <- list(
  N = nrow(dcat1),                  # number of observations
  P = max(dcat1$group),             # number of groups
  Q = max(dcat1$vid),               # number of questions
  K = length(unique(dcat1$value)),  # number of ordered categories
  y = dcat1$value,
  group_of_obs = dcat1$group,
  question_of_obs = dcat1$vid
)
# draw posterior samples
ordered_categorical_logit_fit_m4a <- ordered_categorical_logit_compiled_m4a$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 11.4 seconds.
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 12.4 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 11.9 seconds.
## Total execution time: 12.6 seconds.
# persist the fitted model object to RDS
rds_path <- file.path(dir.out, paste0(file.prefix, "m4a_stan.rds"))
ordered_categorical_logit_fit_m4a$save_object(file = rds_path)

We can illustrate the posterior correlations by plotting the shape of the posterior density of the linear predictor \(\eta_{ij}\) across the four item responses. It is hard to plot a 4D density, but we can easily visualise all pairwise combinations of the 4D density across the four groups (shown in colours, lower triangular part). We clearly see that the model estimates polychoric correlations at a similar level as we observe in the data (upper triangular part).

This is the model fit (in black) relative to the aggregated survey items. Of course, this textbook model has the same deficiencies as the simpler textbook model further above.

4.3 Multi-group model with S2Z correlations

We can of course easily extend the textbook model as elaborated further above,

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \beta_{G(i)} + \gamma_j\\ & c_g = [c_{g1}, c_{g1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{g1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}(0 , 2.5^2)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \gamma \sim \text{S2Z-Normal}(0,1) \end{align*}\]

Here is the Stan code:

file.prefix <- 'parentalmh_ordered_categorical_'

# this is a model with correlation among outcomes but without unit effects 
# Stan program (string literal): multi-group ordered-logistic model with
# sum-to-zero group effects, sum-to-zero question (item) effects, and
# GROUP-SPECIFIC cutpoints. The first cutpoint (cut_point_1) is shared
# across groups; only the positive gaps between cutpoints vary by group,
# which keeps the group-specific cutpoints identifiable against beta_group.
# Generated quantities return category probabilities by group and by
# group-question pair, posterior predictive draws, and per-observation
# log-lik for LOO.
ordered_categorical_logit_txt_m4 <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> Q; // number of questions
  int<lower=2> P; // number of groups
  array[N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=Q> question_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    real s2z_sd_questions;
    
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_sd_questions = inv(sqrt(1. - inv(Q)));
}
parameters
{
  sum_to_zero_vector[P] beta_group;
  
  // parameters for correlation in responses
  sum_to_zero_vector[Q] beta_questions;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  matrix<lower=0>[K - 2,P] cut_point_gaps_groups;
}
transformed parameters
{
  array[P] ordered[K-1] cut_points;
  for (p in 1:P)
  {
    cut_points[p] = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups[,p])
        );
  }
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
      cut_points[group_of_obs[n]]);
  }
  
  // priors
  beta_group ~ normal(0, s2z_sd_groups); 
  cut_point_1 ~ normal(0, 3.5); 
  to_vector(cut_point_gaps_groups) ~ normal(0, 2.5);
  beta_questions ~ normal(0, s2z_sd_questions); 
}
generated quantities
{
  matrix [K,P] ordered_prob;
  array [K,P,Q] real<lower=0, upper=1> ordered_prob_by_question;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  
  for (p in 1:P) {
    ordered_prob[1,p]       = 1 - inv_logit( beta_group[p] - cut_points[p][1] );
    ordered_prob[2:(K-1),p] = inv_logit( beta_group[p] - cut_points[p][1:(K-2)] ) 
                              - 
                              inv_logit( beta_group[p] - cut_points[p][2:(K-1)] );
    ordered_prob[K,p]       = inv_logit( beta_group[p] - cut_points[p][K-1] );
  }
  
  for (q in 1:Q) {
    for (p in 1:P) {
      ordered_prob_by_question[1,p,q]       = 1 - inv_logit( beta_group[p] + beta_questions[q] - cut_points[p][1] );
      ordered_prob_by_question[2:(K-1),p,q] = to_array_1d( inv_logit( beta_group[p] + beta_questions[q] - cut_points[p][1:(K-2)] ) 
                                                           - 
                                                           inv_logit( beta_group[p] + beta_questions[q] - cut_points[p][2:(K-1)] )
                                                           );
      ordered_prob_by_question[K,p,q]       = inv_logit( beta_group[p] + beta_questions[q] - cut_points[p][K-1] );
    }
  }
    
  for( n in 1:N ) {
    ypred[n] = ordered_logistic_rng( 
                beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
                cut_points[group_of_obs[n]]
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_group[group_of_obs[n]] + beta_questions[question_of_obs[n]], 
                    cut_points[group_of_obs[n]]);
  }
}
"

# compile the model: strip tabs from the Stan program, write it to disk,
# then force a fresh cmdstanr compilation
tmp <- gsub("\t", " ", ordered_categorical_logit_txt_m4)
ordered_categorical_logit_filename_m4 <- cmdstanr::write_stan_file(
  code = tmp,
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)
ordered_categorical_logit_compiled_m4 <- cmdstanr::cmdstan_model(ordered_categorical_logit_filename_m4, force_recompile = TRUE)

Let’s run this model too:

# assemble the Stan data list (group + question indices) in one expression
stan_data <- list(
  N = nrow(dcat1),                  # number of observations
  P = max(dcat1$group),             # number of groups
  Q = max(dcat1$vid),               # number of questions
  K = length(unique(dcat1$value)),  # number of ordered categories
  y = dcat1$value,
  group_of_obs = dcat1$group,
  question_of_obs = dcat1$vid
)
# draw posterior samples
ordered_categorical_logit_fit_m4 <- ordered_categorical_logit_compiled_m4$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 17.2 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 18.0 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 17.6 seconds.
## Total execution time: 18.1 seconds.
# Persist the fitted m4 model object to RDS for later post-processing.
ordered_categorical_logit_fit_m4$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m4_stan.rds"))
)

This is the model fit (in black) relative to the aggregated survey items. The model is flexible enough to capture the proportions for each Likert outcome and each group exactly, and matches the polychoric correlations between item responses (not shown, plot is essentially as for the textbook model above).

Let us also quantify that accounting for polychoric correlations indeed improves model fit in terms of Bayes-LOO-ELPD:

# Compare the fitted models by Bayes-LOO-ELPD. Model key (argument order):
#   1: Poisson, no participant effects
#   2: Poisson, participant random effects
#   3: textbook ordered categorical, no participant effects
#   4: multi-group ordered categorical, no participant effects
#   5: multi-group ordered categorical, participant random effects
#   6: textbook ordered categorical with response correlations, no participant effects
#   7: multi-group ordered categorical with response correlations, no participant effects
comp <- loo::loo_compare(
  poisson_log_likert_fit_m0_loo,
  poisson_log_likert_fit_m1_loo,
  ordered_categorical_logit_fit_m0a_loo,
  ordered_categorical_logit_fit_m0_loo,
  ordered_categorical_logit_fit_loo,
  ordered_categorical_logit_fit_m4a_loo,
  ordered_categorical_logit_fit_m4_loo
)
# simplify = FALSE prints the full table (p_loo, looic and standard errors).
print(comp, simplify = FALSE)
##        elpd_diff se_diff elpd_loo se_elpd_loo p_loo   se_p_loo looic   se_looic
## model5     0.0       0.0 -1352.3     18.4       115.5     3.1   2704.5    36.9 
## model6   -93.1      16.7 -1445.4     12.3         8.8     0.2   2890.7    24.7 
## model7   -97.4      16.6 -1449.7     12.6        15.0     0.3   2899.3    25.2 
## model3  -104.8      15.4 -1457.1     11.3         6.0     0.1   2914.2    22.6 
## model4  -108.8      15.3 -1461.1     11.6        12.0     0.2   2922.2    23.2 
## model2  -200.7      16.0 -1553.0     12.8        35.0     1.1   3105.9    25.7 
## model1  -235.8      19.2 -1588.0     13.1         2.7     0.1   3176.1    26.3

5 PHQ4 - Ordered Cat Latent Factor model

5.1 Textbook model

I can summarize all above outcomes with a single sentence: each improvement has brought us closer to a standard latent factor model. So let us investigate a standard latent factor model too. I will start with the textbook version, one cutpoint vector across groups. We don’t expect that this model fits well, but it is important to understand if we could get satisfactory outputs just with BRMS.

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c) \\ & \eta_{ij}= \gamma_j + \lambda_j * (\beta_{G(i)} + \beta^{\text{RE}}_i)\\ & \gamma \sim \text{S2Z-Normal}(0,1)\\ & \lambda_1 = 1\\ & \lambda_{2:J} \sim \text{LogNormal}(0,1)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \beta^{\text{RE}} \sim \text{S2Z-Normal}(0,1)\\ & c = [c_{1}, c_{1} + \text{cumsum}(\delta_{1:{K-2}})] \\ & c_{1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{1:{K-2}} \sim \text{Normal}^{+}(0 , 2.5^2)\\ \end{align*}\]

The key points of the latent factor model are: (1) we consider each participant is now associated with its own latent factor \(\beta_{G(i)} + \beta^{\text{RE}}_i\). These latent factors are modeled through participant-level random effects around a group mean, as further above. (2) We also retain the baseline item effects \(\gamma_j\) to capture correlations across items as above. These item effects also enable us to explain data where latent factors are not needed at all to explain the data. (3) We only added the factor loadings \(\lambda_j\). These essentially provide the model with the ability to link the categorical probabilities to the latent factors through linear functions that also have a slope and are not just horizontal. So this is pretty much the minimum complexity that one would like to consider. The first loading is set to 1 to ensure that the loadings and factors are identifiable.

Here is the Stan code:

# Stan program for the textbook 1D latent factor model (m5a).
# Structure of the model as written below:
#   * ordered-logistic likelihood with ONE cutpoint vector shared across all
#     groups, built from a free first cutpoint plus positive gaps;
#   * linear predictor: item intercept (beta_questions) + item loading
#     (loadings_questions) * latent factor, where the latent factor is the
#     group mean (factor_group) plus a scaled participant random effect
#     (beta_unit_sd * beta_unit);
#   * the first loading is fixed to 1 for identifiability; the remaining
#     loadings are constrained positive with a lognormal(0,1) prior;
#   * sum-to-zero vectors receive normal priors with sd 1/sqrt(1 - 1/n)
#     (s2z_sd_*) so each margin has approximately unit prior variance;
#   * generated quantities: group-by-item mean linear predictors (mean_eta),
#     per-observation category probabilities (ordered_prob_by_obs),
#     posterior predictive draws (ypred), pointwise log-likelihood (log_lik)
#     for LOO, and participant effects on the eta scale (beta_unit_re).
ordered_categorical_logit_txt_m5a <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> Q; // number of questions
  int<lower=2> P; // number of groups
  int<lower=1> U; // number of units
  array [N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=Q> question_of_obs;
  array [N] int<lower=1,upper=U> unit_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    real s2z_sd_unit;
    real s2z_sd_questions;
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_sd_unit = inv(sqrt(1. - inv(U)));
    s2z_sd_questions = inv(sqrt(1. - inv(Q)));
}
parameters
{
  sum_to_zero_vector[P] factor_group;
  sum_to_zero_vector[Q] beta_questions;
  vector<lower=0>[Q-1] loadings_questions_m1;
  sum_to_zero_vector[U] beta_unit;
  real<lower=0> beta_unit_sd;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  vector<lower=0>[K - 2] cut_point_gaps_groups;
}
transformed parameters
{
  ordered[K-1] cut_points;
  vector<lower=0>[Q] loadings_questions;
  
  cut_points = 
      append_row(
        cut_point_1, 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups)
        );
  
  loadings_questions = append_row(1.0, loadings_questions_m1);
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_questions[question_of_obs[n]] + 
        loadings_questions[question_of_obs[n]] *
        (
          factor_group[group_of_obs[n]] + 
          beta_unit_sd * beta_unit[unit_of_obs[n]]
        ), 
      cut_points);
  }
  
  // priors
  factor_group ~ normal(0, s2z_sd_groups); 
  beta_unit ~ normal(0, s2z_sd_unit); 
  beta_questions ~ normal(0, s2z_sd_questions); 
  loadings_questions_m1 ~ lognormal(0,1);
  beta_unit_sd ~ exponential(2); 
  cut_point_1 ~ normal(0, 3.5); 
  cut_point_gaps_groups ~ normal(0, 2.5); 
}
generated quantities
{
  array [P,Q] real mean_eta;
  array [K,N] real<lower=0, upper=1> ordered_prob_by_obs;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  vector[U] beta_unit_re;
  
  for (q in 1:Q) 
  {
    for (p in 1:P) 
    {
      mean_eta[p,q] = beta_questions[q] + loadings_questions[q] * factor_group[p];
    }
  }
  
  {
    real tmp_real;
    for (n in 1:N) 
    {
        tmp_real = beta_questions[question_of_obs[n]] + 
                    loadings_questions[question_of_obs[n]] *
                    (
                      factor_group[group_of_obs[n]] + 
                      beta_unit_sd * beta_unit[unit_of_obs[n]]
                    );
        ordered_prob_by_obs[1,n]       = 1 - inv_logit( tmp_real - cut_points[1] );
        ordered_prob_by_obs[2:(K-1),n] = to_array_1d( inv_logit( tmp_real - cut_points[1:(K-2)] ) 
                                                              - 
                                                              inv_logit( tmp_real - cut_points[2:(K-1)] )
                                                              );
        ordered_prob_by_obs[K,n]       = inv_logit( tmp_real - cut_points[K-1] );
    }
  }
  
  for( n in 1:N ) 
  {
    ypred[n] = ordered_logistic_rng( 
                beta_questions[question_of_obs[n]] + 
                  loadings_questions[question_of_obs[n]] *
                  (
                    factor_group[group_of_obs[n]] + 
                    beta_unit_sd * beta_unit[unit_of_obs[n]]
                  ), 
                cut_points
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_questions[question_of_obs[n]] + 
                      loadings_questions[question_of_obs[n]] *
                      (
                        factor_group[group_of_obs[n]] + 
                        beta_unit_sd * beta_unit[unit_of_obs[n]]
                      ),
                    cut_points);
  }
  
  beta_unit_re = beta_unit_sd * beta_unit;
}
"

# Write the m5a Stan program to dir.out (tabs replaced by spaces, file name
# derived from the code by write_stan_file) and compile it with the default
# caching behaviour.
ordered_categorical_logit_m5a_filename <- cmdstanr::write_stan_file(
  gsub("\t", " ", ordered_categorical_logit_txt_m5a),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

ordered_categorical_logit_compiled_m5a <- cmdstanr::cmdstan_model(
  ordered_categorical_logit_m5a_filename
)

Let’s fit the model!

# Output file prefix for the PHQ ordered-categorical fits.
file.prefix <- "parentalmh_ordered_categorical_"

# Assemble the data list in the format required by the m5a Stan program.
stan_data <- list(
  N = nrow(dcat1),                 # number of observations
  P = max(dcat1$group),            # number of groups
  Q = max(dcat1$vid),              # number of questions
  U = max(dcat1$pid),              # number of participants (units)
  K = length(unique(dcat1$value)), # number of Likert categories
  # NOTE(review): K assumes every category is observed at least once.
  y = dcat1$value,
  group_of_obs = dcat1$group,
  question_of_obs = dcat1$vid,
  unit_of_obs = dcat1$pid
)

# Sample: 2 parallel chains, 500 warmup + 1500 sampling iterations each,
# retaining warmup draws for diagnostics.
ordered_categorical_logit_fit_m5a <- ordered_categorical_logit_compiled_m5a$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 28.6 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 28.6 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 28.6 seconds.
## Total execution time: 28.8 seconds.
# Persist the fitted m5a model object to RDS for later post-processing.
ordered_categorical_logit_fit_m5a$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m5a_stan.rds"))
)

Here are the participant effects around the group mean latent factors, showing the same behaviour as before. This demonstrates that (1) the added baseline response effects are not able to explain the data without latent traits, and (2) there is substantial individual-level heterogeneity around the group mean latent factors.

Here is the polychoric correlation plot. We ignore the \(\beta_i^{\text{RND}}\) latent factor random effects, and can then plot the group-mean multivariate structure of the joint probabilities in logit-space:

Here is the model fit (in black) relative to the aggregated survey items. As before, the fit is not great relative to our primary target statistics:

5.2 First multi-group model

Next up, I expanded the 1D latent factor model to include cutpoints for each group, as in the above multi-group models.

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \gamma_j + \lambda_j * (\beta_{G(i)} + \beta^{\text{RE}}_i)\\ & \gamma \sim \text{S2Z-Normal}(0,1)\\ & \lambda_1 = 1\\ & \lambda_{2:J} \sim \text{LogNormal}(0,1)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \beta^{\text{RE}} \sim \text{S2Z-Normal}(0,1)\\ & c_g = [c_{1}, c_{1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}^{+}(0 , 2.5^2)\\ \end{align*}\]

Here is the Stan code:

# Stan program for the first multi-group 1D latent factor model (m5).
# Differences from the textbook factor model (m5a):
#   * each group p gets its own cutpoint vector cut_points[p]; the FIRST
#     cutpoint (cut_point_1) is shared across groups and only the positive
#     gaps (cut_point_gaps_groups[,p]) are group-specific;
#   * item intercepts and loadings remain shared across groups.
# As in m5a: the first loading is fixed to 1 for identifiability, remaining
# loadings are positive with a lognormal(0,1) prior, sum-to-zero vectors
# receive normal priors rescaled by s2z_sd_* for ~unit marginal variance,
# and generated quantities return mean_eta, per-observation category
# probabilities, posterior predictive draws, pointwise log-lik for LOO, and
# participant effects on the eta scale (beta_unit_re).
ordered_categorical_logit_txt_m5 <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> Q; // number of questions
  int<lower=2> P; // number of groups
  int<lower=1> U; // number of units
  array [N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=Q> question_of_obs;
  array [N] int<lower=1,upper=U> unit_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    real s2z_sd_unit;
    real s2z_sd_questions;
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_sd_unit = inv(sqrt(1. - inv(U)));
    s2z_sd_questions = inv(sqrt(1. - inv(Q)));
}
parameters
{
  sum_to_zero_vector[P] factor_group;
  sum_to_zero_vector[Q] beta_questions;
  vector<lower=0>[Q-1] loadings_questions_m1;
  sum_to_zero_vector[U] beta_unit;
  real<lower=0> beta_unit_sd;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  matrix<lower=0>[K - 2,P] cut_point_gaps_groups;
}
transformed parameters
{
  array[P] ordered[K-1] cut_points;
  vector<lower=0>[Q] loadings_questions;
  
  for (p in 1:P)
  {
    cut_points[p] = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups[,p])
        );
  }
  
  loadings_questions = append_row(1.0, loadings_questions_m1);
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_questions[question_of_obs[n]] + 
        loadings_questions[question_of_obs[n]] *
        (
          factor_group[group_of_obs[n]] + 
          beta_unit_sd * beta_unit[unit_of_obs[n]]
        ), 
      cut_points[group_of_obs[n]]);
  }
  
  // priors
  factor_group ~ normal(0, s2z_sd_groups); 
  beta_unit ~ normal(0, s2z_sd_unit); 
  beta_questions ~ normal(0, s2z_sd_questions); 
  loadings_questions_m1 ~ lognormal(0,1);
  beta_unit_sd ~ exponential(2); // a +1 increase corresponds to ~ doubling of prob, 
                                 // although there is an issue in that for small probs, the same increment provides larger multipliers
                                 // gtools::inv.logit(c(-3,-2)) = 0.04742587 0.11920292
                                 // gtools::inv.logit(c(-1,0)) = 0.2689414 0.5000000
  cut_point_1 ~ normal(0, 3.5); // gtools::inv.logit(2*3.5) = 0.999
  to_vector(cut_point_gaps_groups) ~ normal(0, 2.5); // gtools::inv.logit(-3 + 2 = 1) = 0.26, empirically we know the largest next prob is ~ +0.20
}
generated quantities
{
  array [P,Q] real mean_eta;
  array [K,N] real<lower=0, upper=1> ordered_prob_by_obs;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  vector[U] beta_unit_re;
  
  for (q in 1:Q) 
  {
    for (p in 1:P) 
    {
      mean_eta[p,q] = beta_questions[q] + loadings_questions[q] * factor_group[p];
    }
  }
  
  {
    real tmp_real;
    for (n in 1:N) 
    {
        tmp_real = beta_questions[question_of_obs[n]] + 
                    loadings_questions[question_of_obs[n]] *
                    (
                      factor_group[group_of_obs[n]] + 
                      beta_unit_sd * beta_unit[unit_of_obs[n]]
                    );
        ordered_prob_by_obs[1,n]       = 1 - inv_logit( tmp_real - cut_points[group_of_obs[n]][1] );
        ordered_prob_by_obs[2:(K-1),n] = to_array_1d( inv_logit( tmp_real - cut_points[group_of_obs[n]][1:(K-2)] ) 
                                                              - 
                                                              inv_logit( tmp_real - cut_points[group_of_obs[n]][2:(K-1)] )
                                                              );
        ordered_prob_by_obs[K,n]       = inv_logit( tmp_real - cut_points[group_of_obs[n]][K-1] );
    }
  }
  
  for( n in 1:N ) 
  {
    ypred[n] = ordered_logistic_rng( 
                beta_questions[question_of_obs[n]] + 
                  loadings_questions[question_of_obs[n]] *
                  (
                    factor_group[group_of_obs[n]] + 
                    beta_unit_sd * beta_unit[unit_of_obs[n]]
                  ), 
                cut_points[group_of_obs[n]]
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_questions[question_of_obs[n]] + 
                      loadings_questions[question_of_obs[n]] *
                      (
                        factor_group[group_of_obs[n]] + 
                        beta_unit_sd * beta_unit[unit_of_obs[n]]
                      ),
                    cut_points[group_of_obs[n]]);
  }
  
  beta_unit_re = beta_unit_sd * beta_unit;
}
"

# Write the m5 Stan program to dir.out (tabs replaced by spaces, file name
# derived from the code by write_stan_file) and compile it with the default
# caching behaviour.
ordered_categorical_logit_m5_filename <- cmdstanr::write_stan_file(
  gsub("\t", " ", ordered_categorical_logit_txt_m5),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

ordered_categorical_logit_compiled_m5 <- cmdstanr::cmdstan_model(
  ordered_categorical_logit_m5_filename
)

Let’s fit the model!

# Output file prefix for the PHQ ordered-categorical fits.
file.prefix <- "parentalmh_ordered_categorical_"

# Assemble the data list in the format required by the m5 Stan program.
stan_data <- list(
  N = nrow(dcat1),                 # number of observations
  P = max(dcat1$group),            # number of groups
  Q = max(dcat1$vid),              # number of questions
  U = max(dcat1$pid),              # number of participants (units)
  K = length(unique(dcat1$value)), # number of Likert categories
  # NOTE(review): K assumes every category is observed at least once.
  y = dcat1$value,
  group_of_obs = dcat1$group,
  question_of_obs = dcat1$vid,
  unit_of_obs = dcat1$pid
)

# Sample with the same MCMC settings as the previous fits.
ordered_categorical_logit_fit_m5 <- ordered_categorical_logit_compiled_m5$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 27.9 seconds.
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 28.2 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 28.1 seconds.
## Total execution time: 28.3 seconds.
# Persist the fitted m5 model object to RDS for later post-processing.
ordered_categorical_logit_fit_m5$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m5_stan.rds"))
)

Here are the participant effects around the group mean latent factors, showing the same behaviour as before. This demonstrates that (1) the added baseline response effects are not able to explain the data without latent traits, and (2) there is substantial individual-level heterogeneity around the group mean latent factors.

Here is the polychoric correlation plot. We ignore the \(\beta_i^{\text{RND}}\) latent factor random effects, and can then plot the group-mean multivariate structure of the joint probabilities in logit-space:

Here is the model fit (in black) relative to the aggregated survey items. Hoorary, we retain a good fit relative to our primary target statistics:

Here is the model fit (in black) relative to each survey item. We don’t achieve a good fit. While we previously considered this okay, we would expect that a factor model, which aims to model the responses for each survey item, does a better job here.

Here is a visualisation of the estimated polychoric correlations. We can clearly see how the model is inducing some correlations between item responses:

5.3 Second multi-group model

Next up, I will investigate a more expressive multi-group model with group-specific item baselines and group-specific loadings. I intentionally do not introduce group-specific participant effects, because the same participants are present at Baseline and Endline.

\[\begin{align*} & \text{Prob}(Y_{ij} = k) = \text{Ordered-Logistic}(k; \eta_{ij}, c_{G(i)}) \\ & \eta_{ij}= \gamma_{G(i),j} + \lambda_{G(i),j} * (\beta_{G(i)} + \beta^{\text{RE}}_i)\\ & \gamma_g \sim \text{S2Z-Normal}(0,1)\\ & \lambda_{g,1} = 1\\ & \lambda_{g,2:J} \sim \text{LogNormal}(0,1)\\ & \beta \sim \text{S2Z-Normal}(0, 1)\\ & \beta^{\text{RE}} \sim \text{S2Z-Normal}(0,1)\\ & c_g = [c_{1}, c_{1} + \text{cumsum}(\delta_{g,1:{K-2}})] \\ & c_{1} \sim \text{Normal}(0 , 3.5^2)\\ & \delta_{g,1:{K-2}} \sim \text{Normal}^{+}(0 , 2.5^2)\\ \end{align*}\]

Here is the Stan code:

# Stan program for the second multi-group 1D latent factor model (m5b).
# Differences from the first multi-group factor model (m5):
#   * item intercepts (beta_questions) and loadings (loadings_questions) are
#     now GROUP-SPECIFIC (arrays over P), with the first loading fixed to 1
#     within every group for identifiability;
#   * participant effects remain shared across groups (the same participants
#     appear at Baseline and Endline);
#   * as in m5, each group has its own cutpoint gaps while the first
#     cutpoint (cut_point_1) is shared across groups.
# Generated quantities return the group-by-item mean linear predictors
# (mean_eta), per-observation category probabilities, posterior predictive
# draws (ypred), pointwise log-likelihood (log_lik) for LOO, and the scaled
# participant effects (beta_unit_re).
ordered_categorical_logit_txt_m5b <- "
data
{
  int<lower=1> N; // number of observations
  int<lower=3> K; // number of categories
  int<lower=2> Q; // number of questions
  int<lower=2> P; // number of groups
  int<lower=1> U; // number of units
  array [N] int<lower=1, upper=K> y; // observations
  array [N] int<lower=1,upper=P> group_of_obs;
  array [N] int<lower=1,upper=Q> question_of_obs;
  array [N] int<lower=1,upper=U> unit_of_obs;
}
transformed data
{
    real s2z_sd_groups;
    real s2z_sd_unit;
    real s2z_sd_questions;
    s2z_sd_groups = inv(sqrt(1. - inv(P)));
    s2z_sd_unit = inv(sqrt(1. - inv(U)));
    s2z_sd_questions = inv(sqrt(1. - inv(Q)));
}
parameters
{
  sum_to_zero_vector[P] factor_group;
  array [P] sum_to_zero_vector[Q] beta_questions;
  array [P] vector<lower=0>[Q-1] loadings_questions_m1;
  sum_to_zero_vector[U] beta_unit;
  real<lower=0> beta_unit_sd;
  
  // the cutpoints in logit space, using the 'ordered' variable type
  real cut_point_1;
  matrix<lower=0>[K - 2,P] cut_point_gaps_groups;
}
transformed parameters
{
  array [P] ordered[K-1] cut_points;
  array [P] vector<lower=0>[Q] loadings_questions;
  
  for (p in 1:P)
  {
    cut_points[p] = 
      append_row(
        rep_vector(cut_point_1, 1), 
        rep_vector(cut_point_1, K-2) + cumulative_sum(cut_point_gaps_groups[,p])
        );
        
    loadings_questions[p] = append_row(1.0, loadings_questions_m1[p]);
  }
}
model
{
  // likelihood, cannot be vectorized
  for (n in 1:N) {
    y[n] ~ ordered_logistic(
      beta_questions[group_of_obs[n]][question_of_obs[n]] + 
        loadings_questions[group_of_obs[n]][question_of_obs[n]] *
        (
          factor_group[group_of_obs[n]] + 
          beta_unit_sd * beta_unit[unit_of_obs[n]]
        ), 
      cut_points[group_of_obs[n]]);
  }
  
  // priors
  factor_group ~ normal(0, s2z_sd_groups); 
  beta_unit ~ normal(0, s2z_sd_unit); 
  for (p in 1:P)
  {
    beta_questions[p] ~ normal(0, s2z_sd_questions); 
    loadings_questions_m1[p] ~ lognormal(0,1);
  }
  beta_unit_sd ~ exponential(2); 
  cut_point_1 ~ normal(0, 3.5); 
  to_vector(cut_point_gaps_groups) ~ normal(0, 2.5); 
}
generated quantities
{
  array [P,Q] real mean_eta;
  array [K,N] real<lower=0, upper=1> ordered_prob_by_obs;
  array [N] int<lower=0> ypred;
  array [N] real log_lik;
  vector[U] beta_unit_re;
  
  for (q in 1:Q) 
  {
    for (p in 1:P) 
    {
      mean_eta[p,q] = beta_questions[p][q] + loadings_questions[p][q] * factor_group[p];
    }
  }
  
  {
    real tmp_real;
    for (n in 1:N) 
    {
        tmp_real = beta_questions[group_of_obs[n]][question_of_obs[n]] + 
                    loadings_questions[group_of_obs[n]][question_of_obs[n]] *
                    (
                      factor_group[group_of_obs[n]] + 
                      beta_unit_sd * beta_unit[unit_of_obs[n]]
                    );
        ordered_prob_by_obs[1,n]       = 1 - inv_logit( tmp_real - cut_points[group_of_obs[n]][1] );
        ordered_prob_by_obs[2:(K-1),n] = to_array_1d( inv_logit( tmp_real - cut_points[group_of_obs[n]][1:(K-2)] ) 
                                                              - 
                                                              inv_logit( tmp_real - cut_points[group_of_obs[n]][2:(K-1)] )
                                                              );
        ordered_prob_by_obs[K,n]       = inv_logit( tmp_real - cut_points[group_of_obs[n]][K-1] );
    }
  }
  
  for( n in 1:N ) 
  {
    ypred[n] = ordered_logistic_rng( 
                beta_questions[group_of_obs[n]][question_of_obs[n]] + 
                  loadings_questions[group_of_obs[n]][question_of_obs[n]] *
                  (
                    factor_group[group_of_obs[n]] + 
                    beta_unit_sd * beta_unit[unit_of_obs[n]]
                  ), 
                cut_points[group_of_obs[n]]
                ); 
    log_lik[n] =  ordered_logistic_lpmf( y[n] |
                    beta_questions[group_of_obs[n]][question_of_obs[n]] + 
                      loadings_questions[group_of_obs[n]][question_of_obs[n]] *
                      (
                        factor_group[group_of_obs[n]] + 
                        beta_unit_sd * beta_unit[unit_of_obs[n]]
                      ),
                    cut_points[group_of_obs[n]]);
  }
  
  beta_unit_re = beta_unit_sd * beta_unit;
}
"

# Write the m5b Stan program to dir.out (tabs replaced by spaces, file name
# derived from the code by write_stan_file) and compile it with the default
# caching behaviour.
ordered_categorical_logit_m5b_filename <- cmdstanr::write_stan_file(
  gsub("\t", " ", ordered_categorical_logit_txt_m5b),
  dir = dir.out,
  basename = NULL,
  force_overwrite = FALSE,
  hash_salt = ""
)

ordered_categorical_logit_compiled_m5b <- cmdstanr::cmdstan_model(
  ordered_categorical_logit_m5b_filename
)

Let’s fit the model!

# Output file prefix for the PHQ ordered-categorical fits.
file.prefix <- "parentalmh_ordered_categorical_"

# Assemble the data list in the format required by the m5b Stan program.
stan_data <- list(
  N = nrow(dcat1),                 # number of observations
  P = max(dcat1$group),            # number of groups
  Q = max(dcat1$vid),              # number of questions
  U = max(dcat1$pid),              # number of participants (units)
  K = length(unique(dcat1$value)), # number of Likert categories
  # NOTE(review): K assumes every category is observed at least once.
  y = dcat1$value,
  group_of_obs = dcat1$group,
  question_of_obs = dcat1$vid,
  unit_of_obs = dcat1$pid
)

# Sample with the same MCMC settings as the previous fits.
ordered_categorical_logit_fit_m5b <- ordered_categorical_logit_compiled_m5b$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 5e2,
  iter_sampling = 15e2,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 36.0 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 39.1 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 37.5 seconds.
## Total execution time: 39.2 seconds.
# Persist the fitted m5b model object to RDS for later post-processing.
ordered_categorical_logit_fit_m5b$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m5b_stan.rds"))
)

Here is the model fit (in black) relative to each survey item. We still don’t achieve a good fit; I believe this must be coming from the temporal constraints imposed by participant effects across Baseline and Endline. It is puzzling, though, that there is no improvement relative to the first multi-group model.

At this point, let us inspect Bayes-LOO-ELPD.

We clearly see that the Ordered Categorical 1D latent factor models explain the data better than all other models. Considering the standard error around ELPD differences between models, we also see that none of the 1D latent factor models is clearly better than any of the other two in terms of ELPD.

# Compare all fitted models by Bayes-LOO-ELPD. Model key (argument order):
#    1: Poisson, no participant effects
#    2: Poisson, participant random effects
#    3: textbook ordered categorical, no participant effects
#    4: multi-group ordered categorical, no participant effects
#    5: multi-group ordered categorical, participant random effects
#    6: textbook ordered categorical with response correlations, no participant effects
#    7: multi-group ordered categorical with response correlations, no participant effects
#    8: textbook 1D latent factor model
#    9: first multi-group 1D latent factor model
#   10: second multi-group 1D latent factor model
comp <- loo::loo_compare(
  poisson_log_likert_fit_m0_loo,
  poisson_log_likert_fit_m1_loo,
  ordered_categorical_logit_fit_m0a_loo,
  ordered_categorical_logit_fit_m0_loo,
  ordered_categorical_logit_fit_loo,
  ordered_categorical_logit_fit_m4a_loo,
  ordered_categorical_logit_fit_m4_loo,
  ordered_categorical_logit_fit_m5a_loo,
  ordered_categorical_logit_fit_m5_loo,
  ordered_categorical_logit_fit_m5b_loo
)
# simplify = FALSE prints the full table (p_loo, looic and standard errors).
print(comp, simplify = FALSE)
##         elpd_diff se_diff elpd_loo se_elpd_loo p_loo   se_p_loo looic  
## model8      0.0       0.0 -1320.3     18.8       115.8     3.6   2640.6
## model9     -3.5       1.8 -1323.8     19.1       121.7     3.7   2647.5
## model10   -11.0       6.7 -1331.3     19.4       147.3     5.1   2662.6
## model5    -31.9       8.0 -1352.3     18.4       115.5     3.1   2704.5
## model6   -125.1      15.8 -1445.4     12.3         8.8     0.2   2890.7
## model7   -129.4      15.8 -1449.7     12.6        15.0     0.3   2899.3
## model3   -136.8      16.2 -1457.1     11.3         6.0     0.1   2914.2
## model4   -140.8      16.2 -1461.1     11.6        12.0     0.2   2922.2
## model2   -232.7      16.9 -1553.0     12.8        35.0     1.1   3105.9
## model1   -267.7      19.7 -1588.0     13.1         2.7     0.1   3176.1
##         se_looic
## model8     37.7 
## model9     38.1 
## model10    38.8 
## model5     36.9 
## model6     24.7 
## model7     25.2 
## model3     22.6 
## model4     23.2 
## model2     25.7 
## model1     26.3

6 VAC - Ordered Cat model

Let us move to the VAC data, and repeat the main steps above. Here are the survey responses to each of the 10 items targeting latent behavior ‘violence against children (VAC)’ across participants:

Polychoric correlations among item responses in the same group are again very high:

# Reshape to wide format (one column per item), then pair every other item
# with the reference item VACE_threat in long format, so each row holds one
# (VACE_threat, other-item) response pair.
# Fix: use the exported `data.table::dcast` via `::` rather than `:::`
# (triple colon accesses internals and is fragile across versions).
tmp <- data.table::dcast(dcat3, time_label + arm_label + pid ~ variable, value.var = "value")
tmp <- data.table::melt(tmp, id.vars = c("time_label", "arm_label", "pid", "VACE_threat"))
# Two-step (ML = FALSE) polychoric correlation of each item with
# VACE_threat, by arm and timepoint, with its standard error.
# NOTE(review): the output column name 'polchor' is missing a 'y'
# (polychor); kept as-is so downstream code/output stays consistent.
tmp[,
    {
      z <- polycor::polychor(VACE_threat, value, std.err = TRUE, ML = FALSE)
      list(polchor = z$rho, polychor_sd = sqrt(as.numeric(z$var)))
    },
    by = c("arm_label", "time_label")
    ]
##       arm_label time_label   polchor polychor_sd
##          <char>     <char>     <num>       <num>
## 1:      Control   Baseline 0.6894082  0.04315532
## 2: Intervention   Baseline 0.4667370  0.05385451
## 3:      Control    Endline 0.6516906  0.04053380
## 4: Intervention    Endline 0.8495877  0.02264638

6.1 Flexible Multi-Group Ordered Categorical Model without participant effects

Let’s fit the flexible Multi-Group Model without participant effects first.

# Assemble the data list expected by the Stan model (no participant effects).
stan_data <- list(
  N = nrow(dcat3),                  # number of item-response observations
  P = max(dcat3$group),             # number of arm x timepoint groups
  K = length(unique(dcat3$value)),  # number of ordered Likert categories
  y = dcat3$value,                  # observed ordinal outcome per row
  group_of_obs = dcat3$group        # group index of each observation
)

# Draw posterior samples with cmdstanr (2 chains, 500 warmup + 1500 sampling).
vac_ordered_categorical_logit_fit_m0 <- ordered_categorical_logit_compiled_m0$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 500,
  iter_sampling = 1500,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 25.8 seconds.
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 26.0 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 25.9 seconds.
## Total execution time: 26.1 seconds.
# Persist the fitted cmdstanr object to RDS so it can be reloaded later.
vac_ordered_categorical_logit_fit_m0$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m0_stan.rds"))
)

This is the model fit (in black) relative to the aggregated survey items. The model is flexible enough to capture the proportions for each Likert outcome and each group exactly.

Below is the model fit (in black) relative to each survey item. Again, there remains unexplained heterogeneity across each survey item, and we consider this okay because each question elicits the underlying target behaviour in a slightly different way. We note that the outcome frequencies are bimodal and increasing for the ‘Nearly every day’ category, particularly in the Control group. The Categorical model can match these trends, and it is already clear that a Poisson model will fail in matching these trends.

6.2 Multi-Group Ordered Categorical Model with participant effects

Let’s now fit the Multi-Group Model with participant effects.

# Assemble the data list expected by the Stan model (with participant effects).
stan_data <- list(
  N = nrow(dcat3),                  # number of item-response observations
  P = max(dcat3$group),             # number of arm x timepoint groups
  U = max(dcat3$pid),               # number of participants (units)
  K = length(unique(dcat3$value)),  # number of ordered Likert categories
  y = dcat3$value,                  # observed ordinal outcome per row
  group_of_obs = dcat3$group,       # group index of each observation
  unit_of_obs = dcat3$pid           # participant index of each observation
)

# Draw posterior samples with cmdstanr (2 chains, 500 warmup + 1500 sampling).
vac_ordered_categorical_logit_fit <- ordered_categorical_logit_compiled_m1$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 500,
  iter_sampling = 1500,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 58.6 seconds.
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 59.7 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 59.1 seconds.
## Total execution time: 59.9 seconds.
# Persist the fitted cmdstanr object to RDS so it can be reloaded later.
vac_ordered_categorical_logit_fit$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m1_stan.rds"))
)

We see once more distinct participant-level effects that define participant-level latent factors:

This is the model fit (in black) relative to the aggregated survey items. In this model, the participant level effects systematically influence the group-level outcome probabilities.

Here is a visualisation of the estimated polychoric correlations. We can clearly see that with participant effects, the model is already able to capture correlations between item responses.

7 VAC - Poisson model

I will next consider the VAC Likert outcomes as if they were counts and aggregate these across all itemized questions that target the same behaviour.

7.1 Model without participant effects

Let us fit the Poisson model.

This is the model fit to the aggregated Likert outcomes when considered as counts. With 10 questions targeting the same behaviour, it is immediately clear that the Poisson model provides an unacceptably poor fit to the aggregated data.

7.2 On Likert data for model comparison

I will now again consider the raw data for each survey question, but instead of interpreting these data as Likert scale outcomes, I will interpret these data as counts, and model these with a Poisson distribution.

I will start again with the model without participant effects.

We can see that the Poisson fit always follows a unimodal shape that does not capture the increasing trends in frequencies of the 2 and 3 category, particularly in the Control group.

7.3 Model with participant effects

This is the fit of the Poisson model with participant effects. Model accuracy remains essentially unchanged.

8 VAC - Ordered Cat Latent Factor model

# Output file prefix for the VAC ordered-categorical latent factor model.
file.prefix <- "vac_ordered_categorical_"

# Assemble the data list expected by the Stan latent factor model.
stan_data <- list(
  N = nrow(dcat3),                  # number of item-response observations
  P = max(dcat3$group),             # number of arm x timepoint groups
  Q = max(dcat3$vid),               # number of survey items (questions)
  U = max(dcat3$pid),               # number of participants (units)
  K = length(unique(dcat3$value)),  # number of ordered Likert categories
  y = dcat3$value,                  # observed ordinal outcome per row
  group_of_obs = dcat3$group,       # group index of each observation
  question_of_obs = dcat3$vid,      # item index of each observation
  unit_of_obs = dcat3$pid           # participant index of each observation
)

# Draw posterior samples with cmdstanr (2 chains, 500 warmup + 1500 sampling).
vac_ordered_categorical_logit_fit_m5 <- ordered_categorical_logit_compiled_m5$sample(
  data = stan_data,
  seed = 123,
  chains = 2,
  parallel_chains = 2,
  iter_warmup = 500,
  iter_sampling = 1500,
  refresh = 500,
  save_warmup = TRUE
)
## Running MCMC with 2 parallel chains...
## 
## Chain 2 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:    1 / 2000 [  0%]  (Warmup) 
## Chain 1 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 1 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 2 Iteration:  500 / 2000 [ 25%]  (Warmup) 
## Chain 2 Iteration:  501 / 2000 [ 25%]  (Sampling) 
## Chain 1 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 2 Iteration: 1000 / 2000 [ 50%]  (Sampling) 
## Chain 1 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 2 Iteration: 1500 / 2000 [ 75%]  (Sampling) 
## Chain 1 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 1 finished in 106.7 seconds.
## Chain 2 Iteration: 2000 / 2000 [100%]  (Sampling) 
## Chain 2 finished in 107.8 seconds.
## 
## Both chains finished successfully.
## Mean chain execution time: 107.3 seconds.
## Total execution time: 108.1 seconds.
# Persist the fitted cmdstanr object to RDS so it can be reloaded later.
vac_ordered_categorical_logit_fit_m5$save_object(
  file = file.path(dir.out, paste0(file.prefix, "m5_stan.rds"))
)

Here are the participant effects around the group mean latent factors, showing similar behaviour as before. This again demonstrates that (1) the added baseline response effects are not able to explain the data without latent traits, and (2) there is substantial individual-level heterogeneity around the group mean latent factors.

Here is the polychoric correlation plot:

Here is the model fit (in black) relative to the aggregated survey items. Hooray, we have a good fit relative to our primary target statistics:

Here is the model fit (in black) relative to each survey item. The fit is not perfect at the item level, but appears adequate.

Here is a visualisation of the estimated polychoric correlations. We can clearly see how the model is inducing some correlations between item responses:

9 VAC - Compare models

Here is the model comparison in terms of Bayesian leave-one-out (Bayes-LOO) predictive performance measured by expected log posterior density (ELPD):

We see that the 1D latent factor model is the best performing model.

# Compare the five fitted VAC models by Bayesian LOO (ELPD), best model first.
# NOTE(review): the header comments below use the report's own model numbering
# (1, 2, 4, 5, 9); the printed table labels models as model1..model5 purely by
# argument position in the loo_compare() call — confirm the mapping when
# reading the output (e.g. printed 'model5' is the latent factor model,
# report 'Model 9').
# Model 1 - Poisson no participant effects
# Model 2 - Poisson with participant random effects
# Model 4 - Multi-group Ordered Cat, no participant effects
# Model 5 - Multi-group Ordered Cat, with participant random effects
# Model 9 - First Multi-group 1D Latent factor model
comp <- loo::loo_compare(vac_poisson_log_likert_fit_m0_loo,
                         vac_poisson_log_likert_fit_m1_loo,
                         vac_ordered_categorical_logit_fit_m0_loo, 
                         vac_ordered_categorical_logit_fit_loo,
                         vac_ordered_categorical_logit_fit_m5_loo
                         )

# simplify = FALSE prints the full table (p_loo, looic, and standard errors).
print(comp, simplify = FALSE)
##        elpd_diff se_diff elpd_loo se_elpd_loo p_loo   se_p_loo looic   se_looic
## model5     0.0       0.0 -2067.8     47.0       139.6     5.1   4135.6    94.0 
## model4  -194.8      20.3 -2262.6     48.5       126.7     3.8   4525.2    97.0 
## model3  -555.3      28.3 -2623.1     46.2        12.0     0.3   5246.2    92.3 
## model2  -715.5      38.0 -2783.3     57.1       169.8     7.3   5566.5   114.1 
## model1 -1295.9      49.2 -3363.7     54.1         7.2     0.2   6727.4   108.1

10 Final analysis

10.1 Primary endpoints

The primary endpoint in this pre-post study is comparing the average change \(\Delta_{g,f}\) in negative outcomes in the latent behavioral factors \(f\) (parental mental health, violence against children) that occur ‘Nearly every day’ between Baseline and Endline among the Intervention and Control groups \(g \in \{I, C\}\), respectively. We will quantify the corresponding risk differences, \[\begin{equation} \text{RD}(f) = \Delta_{I,f} - \Delta_{C,f}, \end{equation}\] for both the latent parental mental health and violence against children behaviours. Here, the average change \(\Delta_{g,f}\) in the intervention and control groups is defined as \[\begin{equation} \Delta_{g,f} = \frac{1}{N_g}\sum_{i \in g} \frac{1}{J}\sum_j \left( p^{f, t=0}_{i,j,k=4} - p^{f, t=1}_{i,j,k=4} \right), \end{equation}\] where \(p^{f, t}_{i,j,k=4}\) describes the probability of a ‘Nearly every day’ response (\(k=4\)) to survey item \(j\) of participant \(i\) at time \(t\).

# process Monte Carlo outputs

# process VAC: posterior category probabilities for every observation
po <- vac_ordered_categorical_logit_fit_m5$draws(
  variables = c('ordered_prob_by_obs'),
  inc_warmup = FALSE,
  format = "draws_df"
)
po <- data.table::melt(
  as.data.table(po),
  id.vars = c('.draw', '.chain', '.iteration'),
  value.name = 'prob'
)
# parse Stan indices out of names of the form 'ordered_prob_by_obs[value,oid]'
po[, value := as.integer(gsub('ordered_prob_by_obs\\[([0-9]),([0-9]+)\\]', '\\1', variable))]
po[, oid := as.integer(gsub('ordered_prob_by_obs\\[([0-9]),([0-9]+)\\]', '\\2', variable))]
po[, variable := NULL]
# attach observation metadata and keep the 'Nearly every day' category (k = 4)
tmp <- unique(dcat3[, list(oid, pid, vid, group, arm, arm_label, time, time_label)])
po <- merge(po, tmp, by = 'oid')
po <- po[value == 4]
# wide by timepoint, then per-draw change statistics for each participant/item
po <- data.table::dcast(po, .draw + pid + vid + arm + arm_label ~ time_label, value.var = 'prob')
po[, `:=`(diff = Baseline - Endline, ratio = 1 - Endline / Baseline)]
# average across participants and items within each posterior draw and arm
po <- po[, list(diff = mean(diff), ratio = mean(ratio)), by = c('.draw', 'arm', 'arm_label')]
po[, trait_label := 'Violence Against Children']
po.vac <- copy(po)

# process PHQ: posterior category probabilities for every observation
po <- ordered_categorical_logit_fit_m5$draws(
  variables = c('ordered_prob_by_obs'),
  inc_warmup = FALSE,
  format = "draws_df"
)
po <- data.table::melt(
  as.data.table(po),
  id.vars = c('.draw', '.chain', '.iteration'),
  value.name = 'prob'
)
# parse Stan indices out of names of the form 'ordered_prob_by_obs[value,oid]'
po[, value := as.integer(gsub('ordered_prob_by_obs\\[([0-9]),([0-9]+)\\]', '\\1', variable))]
po[, oid := as.integer(gsub('ordered_prob_by_obs\\[([0-9]),([0-9]+)\\]', '\\2', variable))]
po[, variable := NULL]
# attach observation metadata and keep the 'Nearly every day' category (k = 4)
tmp <- unique(dcat1[, list(oid, pid, vid, group, arm, arm_label, time, time_label)])
po <- merge(po, tmp, by = 'oid')
po <- po[value == 4]
# wide by timepoint, then per-draw change statistics for each participant/item
po <- data.table::dcast(po, .draw + pid + vid + arm + arm_label ~ time_label, value.var = 'prob')
po[, `:=`(diff = Baseline - Endline, ratio = 1 - Endline / Baseline)]
# average across participants and items within each posterior draw and arm
po <- po[, list(diff = mean(diff), ratio = mean(ratio)), by = c('.draw', 'arm', 'arm_label')]
po[, trait_label := 'Parental Mental Health']
po.pmh <- copy(po)

I first plot the average changes at Endline versus Baseline among participants in the Intervention and Control group:

And here are the estimates of the primary endpoint for both Parental Mental Health and Violence Against Children:

## Key: <variable, trait_label>
##    variable               trait_label  iqr_lower  iqr_upper     median
##      <fctr>                    <char>      <num>      <num>      <num>
## 1:     diff    Parental Mental Health 0.18993899 0.24958385 0.22122330
## 2:     diff Violence Against Children 0.03737128 0.06576752 0.05226997
## 3:    ratio    Parental Mental Health 0.68567675 0.78107169 0.73881688
## 4:    ratio Violence Against Children 0.49241655 0.63930980 0.57018318
##       q_lower    q_upper                 label
##         <num>      <num>                <char>
## 1: 0.13263197 0.30254308 22.1% (13.3% - 30.3%)
## 2: 0.01330839 0.09324889    5.2% (1.3% - 9.3%)
## 3: 0.55801615 0.84646304 73.9% (55.8% - 84.6%)
## 4: 0.30843398 0.74264103   57% (30.8% - 74.3%)